In [None]:
import pandas as pd
import numpy as np
import gensim.downloader as api

In [None]:
# Read complaints_processed.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("complaints_processed.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

In [None]:
# Number of rows per value of the 'product' column (the label mapped to product_num later on).
df['product'].value_counts()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Remove the 'Unnamed: 0' column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# (rows, columns) after removing the 'Unnamed: 0' column.
df.shape

In [None]:
# Load the 'word2vec-google-news-300' embeddings through gensim's downloader API.
# NOTE(review): presumably a large download on first use (cached afterwards) — confirm before running in CI.
word2Vec = api.load('word2vec-google-news-300')

In [None]:
# Sanity check: average the embeddings of a few tokens.
# Every entry has to be a single vocabulary key. The former "the word" entry
# contained a space, so it is very unlikely to be a key, and get_mean_vector skips
# unknown keys silently by default — it never contributed to the mean.
word = word2Vec.get_mean_vector(["hello", "all", "the", "word"], pre_normalize=True)
word

In [None]:
# Re-check the frame dimensions before cleaning the 'narrative' column.
df.shape

In [None]:
# How many narratives are missing (NaN) at this point?
df['narrative'].isna().sum()

In [None]:
# Mark empty / whitespace-only narratives as missing so the dropna step can remove them.
# Series.replace returns a new Series; without assigning it back the frame stays unchanged.
df['narrative'] = df['narrative'].replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Missing-narrative count after the blank-string replacement step.
df['narrative'].isna().sum()

In [None]:
# Keep only the rows that actually have a narrative; rows whose narrative is NaN are discarded.
has_narrative = df['narrative'].notna()
df = df.loc[has_narrative]

In [None]:
# (rows, columns) after dropping rows without a narrative.
df.shape

In [None]:
# Should now be zero: rows with a missing narrative were just dropped.
df['narrative'].isna().sum()

In [None]:
# Discard every row that exactly repeats an earlier row (the first occurrence is kept),
# then show the resulting dimensions.
is_repeat = df.duplicated(keep='first')
df = df.loc[~is_repeat]
df.shape

In [None]:
# Work on the first 5000 rows only (presumably to keep vectorisation and training quick).
# .copy() makes df_new an independent frame, so the columns added to it later do not
# raise SettingWithCopyWarning or depend on whether the slice is a view of df.
df_new = df.iloc[:5000].copy()

In [None]:
# Blank narratives carry no text to embed. Series.replace returns a new object, so the
# original bare call changed nothing. Here the cleaned column is written back through
# assign() (which builds a fresh frame instead of modifying a slice of df) and rows
# left without a narrative are removed in the same step.
df_new = (
    df_new
    .assign(narrative=df_new['narrative'].replace(r'^\s*$', np.nan, regex=True))
    .dropna(subset=['narrative'])
)

In [None]:
# Missing-narrative count inside the 5000-row working subset.
df_new['narrative'].isna().sum()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps runs of word characters (\w+); anything else, e.g. punctuation, is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# English stop words held in a set so membership tests in preprocess_vectorize are cheap.
# NOTE(review): stopwords.words() needs the NLTK 'stopwords' corpus to be installed
# (nltk.download('stopwords')) — confirm it is available in the target environment.
stop_words = set(stopwords.words('english'))
def preprocess_vectorize(text):
    """Convert one narrative into a single averaged word2vec embedding.

    The text is split with the module-level ``tokenizer``, lower-cased, and
    filtered: English stop words (``stop_words``) and tokens that are not keys
    of the ``word2Vec`` model are discarded. The vectors of the remaining
    tokens are averaged with ``word2Vec.get_mean_vector``.

    Parameters
    ----------
    text : str
        Raw narrative text. Non-string values (e.g. a float NaN coming from a
        missing cell) are treated as "no text" instead of raising.

    Returns
    -------
    numpy.ndarray or float
        The mean embedding, or ``np.nan`` when no usable token is left.
        Callers must filter out the NaN results before stacking the vectors
        into a matrix.
    """
    # Missing cells reach this function as float NaN, which the tokenizer cannot handle.
    if not isinstance(text, str):
        return np.nan

    lowered = (raw.lower() for raw in tokenizer.tokenize(text))
    tokens = [
        token
        for token in lowered
        if token not in stop_words and token in word2Vec.key_to_index
    ]

    if not tokens:
        return np.nan
    return word2Vec.get_mean_vector(tokens)

In [None]:

# Embed every narrative. assign() returns a new frame instead of writing into what may
# be a slice of df, and the function is passed directly (no wrapping lambda needed).
df_new = df_new.assign(vector=df_new['narrative'].apply(preprocess_vectorize))

# preprocess_vectorize yields NaN when a narrative has no usable token. A scalar NaN
# cannot be stacked together with the embedding arrays, so those rows are removed here.
df_new = df_new.dropna(subset=['vector'])


In [None]:
# Encode the product label as an integer class id: the position of the label in the
# tuple below is its id (credit_card -> 0 ... debt_collection -> 4).
df_new['product_num'] = df_new['product'].map(
    {
        label: class_id
        for class_id, label in enumerate((
            'credit_card',
            'retail_banking',
            'credit_reporting',
            'mortgages_and_loans',
            'debt_collection',
        ))
    }
)

In [None]:
# Class distribution inside the working subset (shows how imbalanced the labels are).
df_new['product'].value_counts()

In [None]:
# Check the dimensionality of one embedding. Positional access is used on purpose:
# the index keeps its original labels after the dropna / drop_duplicates steps, so a
# label lookup such as [4000] raises KeyError whenever that particular row was removed.
df_new['vector'].iloc[0].shape

In [None]:
# Features are the per-narrative embeddings; the target is the integer product code.
x, y = df_new['vector'], df_new['product_num']

In [None]:
# Turn the per-row objects into plain numpy arrays: stacking the embeddings adds a
# leading sample axis, stacking the scalar labels gives a 1-D array.
feature_rows = list(x)
label_values = list(y)
x, y = np.stack(feature_rows), np.stack(label_values)

In [None]:
# A cell only displays its last expression, so both shapes are returned as one tuple;
# as two separate lines the first one (x.shape) was never shown.
x.shape, y.shape

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority classes with SMOTE. The seed is pinned (same value as the
# train/test split) so the synthetic samples are identical on every run.
x_smote, y_smote = SMOTE(random_state=42).fit_resample(x, y)

In [None]:
# Class balance after resampling. The labels live in y_smote, and fit_resample was fed
# numpy arrays, so the result is wrapped in a Series to get value_counts()
# (a numpy array has no such method).
pd.Series(y_smote).value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out the test set from the ORIGINAL samples before any oversampling. SMOTE
# synthesises points by interpolating between neighbouring samples, so resampling
# first and splitting afterwards lets information about test points leak into the
# training data and inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,  # 20% samples will go to test dataset
    random_state=42,
    stratify=y
)

# Balance the classes on the training portion only; the test set keeps the real
# class distribution. Seeded for repeatable runs.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier , VotingClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Display labels, one per entry of `classifiers` and in the same order. The evaluation
# loop pairs the two lists with zip(), which stops at the shorter one — with fewer
# names than models the trailing models are silently skipped and labels shift.
names = [
    "svc",
    "random_forest",
    "mlp",
    "voting"
]

# Candidate models. The soft-voting ensemble averages the predicted class
# probabilities of its members, which is why its SVC needs probability=True.
# Seeds are pinned so repeated runs give the same scores.
classifiers = [
    SVC(),
    RandomForestClassifier(random_state=42),
    MLPClassifier(random_state=42),
    VotingClassifier(
        estimators=[
            ('rbfsvm', SVC(probability=True, random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('nu', MLPClassifier(random_state=42)),
        ],
        voting='soft',
    )
]


In [None]:
# Fit every candidate behind a StandardScaler and report accuracy and weighted F1 on
# the held-out split.
for name, clf in zip(names, classifiers):
    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X_train, y_train)

    # Predict once and derive both metrics from the same predictions; calling
    # clf.score() as well would repeat the whole prediction pass just to compute accuracy.
    pred = clf.predict(X_test)
    score = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred, average='weighted')

    print(f'classifier: {name} | the score is: {score} | the f1 is: {f1} ')