In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from string import punctuation
custom = stop_words+list(punctuation)
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("Consumer_Complaints.csv",encoding = 'ISO-8859-1')
df.head()
df = df.drop(['Unnamed: 18'],1)

### Data information

In [3]:
# Information

print('Dataframe shape:\n',df.shape,'\n\n')
print('Info:\n',df.info(),'\n','\n')
print('Class counts:\n\n',df['Product'].value_counts(),'\n\n')
print('Null Values\n\n',df.isnull().sum(),'\n\n')
print('Columns:\n',df.columns)

Dataframe shape:
 (1025010, 18) 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025010 entries, 0 to 1025009
Data columns (total 18 columns):
Date received                   1025010 non-null object
Product                         1025010 non-null object
Sub-product                     789840 non-null object
Issue                           1025010 non-null object
Sub-issue                       528853 non-null object
Consumer Complaint              277814 non-null object
Company Public Response         318364 non-null object
Company                         1025010 non-null object
State                           1012650 non-null object
ZIP code                        1008292 non-null object
Tags                            141588 non-null object
Consumer consent provided?      491911 non-null object
Submitted via                   1025010 non-null object
Date Sent to Company            1025010 non-null object
Company Response to Consumer    1025007 non-null object
Timely response?  

In [4]:
# tokenizer function

def my_tokenizer(s):
    try:
        s = s.lower() # downcase
    except:
        s = str(s).lower()
    tokens = nltk.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in custom] # remove stopwords
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)] # remove any digits, i.e. "3rd edition"
    return tokens

### Data Preprocess

In [8]:
def Preprocessing(df1, column_1, column_2):    #column_1 - reviews/msgs/complaints, column_2 - class/sentiments/Products
    
    # Remove Null values
    df1.dropna(inplace = True)
    df1 = df1[[column_1, column_2]]
    
    # Convert to list and tokenize
    text = df1[column_1].tolist()
    cleaned_text = []
    for x in text:
        cleaned_text.append(my_tokenizer(x))
    joined_text = []
    for x in cleaned_text:
        joined_text.append(' '.join(x))
        
    #Create New Dataframe    
    df2 = pd.DataFrame({column_1:joined_text}) #mention column_1
    df2[column_2] = pd.Series(df1[column_2].tolist())
    df2[column_2] = df2[column_2].str.lower() # mention column_2
    
    #check if the dataframe is proper
    print(df2.head())
    
    
    # Information

    print('Dataframe shape:\n',df2.shape,'\n\n')
    print('Info:\n',df2.info(),'\n','\n')
    print('Class counts:\n\n',df2['Product'].value_counts(),'\n\n')
    print('Null Values\n\n',df2.isnull().sum(),'\n\n')
    print('Columns:\n',df2.columns,'\n\n')
    
    # TF-IDF
    tfidf = TfidfVectorizer()
    vector = tfidf.fit_transform(df2[column_1])
    vector_values_array = vector.toarray()
    X = vector_values_array
    
    
    # Label Encoding
    le = LabelEncoder()
    y = le.fit_transform(df2[column_2])
    
    #Train & Test split
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
    
    
    return X_train,X_test,y_train,y_test

### Train and Test

In [9]:
X_train,X_test,y_train,y_test = Preprocessing(df,'Consumer Complaint','Product')

print('X_train:',X_train.shape)
print('y_train:',y_train.shape)
print('X_test:',X_test.shape)
print('y_test:',y_test.shape)

                                  Consumer Complaint          Product
0  wa called cell phone first national debt inc. ...  debt collection
1  sent cease desist letter medical debt collecti...  debt collection
2  phoenix financial service llc continues report...  debt collection
3  broke rib fishing trip xxxx xxxx wa taken loca...  debt collection
4  name xxxx xxxx xxxx company reported acct xxxx...  debt collection
Dataframe shape:
 (3105, 2) 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3105 entries, 0 to 3104
Data columns (total 2 columns):
Consumer Complaint    3105 non-null object
Product               3105 non-null object
dtypes: object(2)
memory usage: 48.6+ KB
Info:
 None 
 

Class counts:

 debt collection    2969
student loan        136
Name: Product, dtype: int64 


Null Values

 Consumer Complaint    0
Product               0
dtype: int64 


Columns:
 Index(['Consumer Complaint', 'Product'], dtype='object') 


X_train: (2484, 8723)
y_train: (2484,)
X_test: (621, 8723

### Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, classification_report


# LogisticRegression
logit = LogisticRegression()
logit.fit(X_train,y_train)
logitc_acc = logit.score(X_test,y_test)
logit_d = pd.DataFrame(report).transpose().iloc[0:2,0:3]

# MultinomialNB
mul = MultinomialNB()
mul.fit(X_train, y_train)
mul_acc = mul.score(X_test,y_test)
mul_d = pd.DataFrame(report).transpose().iloc[0:2,0:3]

# SVC
svc = SVC()
svc.fit(X_train, y_train)
svc_acc = svc.score(X_test,y_test)
svc_d = pd.DataFrame(report).transpose().iloc[0:2,0:3]

# RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
rfc_acc = rfc.score(X_test,y_test)
rfc_d = pd.DataFrame(report).transpose().iloc[0:2,0:3]


# xgboost
xg = xgboost.XGBClassifier()
xg.fit(X_train,y_train)
xg_acc = xg.score(X_train,y_train)
xg_d = pd.DataFrame(report).transpose().iloc[0:2,0:3]


#AdaBoostClassifier
ada = AdaBoostClassifier()
ada.fit(X_train,y_train)
ada_acc = ada.score(X_test,y_test)
ada_d = pd.DataFrame(report).transpose().iloc[0:2,0:3]





### Models Report

In [None]:
Model_data = pd.concat([logit_d,mul_d,svc_d,rfc_d,xg_d,ada_d])
Model_data['Models'] = ['LogisticRegression','LogisticRegression','MultinomialNB','MultinomialNB','SVC','SVC','RandomForestClassifier'
                        ,'RandomForestClassifier','xgboost','xgboost','AdaBoostClassifier','AdaBoostClassifier']
Model_data['accuracy'] = [logitc_acc,logitc_acc,mul_acc,mul_acc,svc_acc,svc_acc,rfc_acc,rfc_acc,xg_acc,xg_acc,ada_acc,ada_acc]
Model_data = Model_data[['Models','accuracy','precision','recall','f1-score']]
Model_data.index.name = 'Product'

In [None]:
Model_data