# Model Building

 This notebook consists of  different classification models to get better performance metrics for the model deployment

### Importing Packages

In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
# Build Pipeline
import joblib
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

### Loading the cleaned data

In [None]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,sentiment,cleaned_content
0,joy,sage act upgrade list tommorow
1,sadness,way homegirl baby funeral man hate funeral sho...
2,joy,eye true hazel eyeand brilliant regular featur...
3,joy,ugh babe hugggzzz babe naamaze nga ako babe de...
4,fear,-PRON- be expect extremely important phonecall...


In [None]:
df.shape

(30631, 2)

### Missing values

In [None]:
df.isnull().sum()

sentiment           0
cleaned_content    14
dtype: int64

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df['sentiment'].value_counts(sort = True)

joy         10395
sadness      6128
fear         4201
surprise     4008
shame        3882
neutral      1149
disgust       854
Name: sentiment, dtype: int64

### Baseline Model

Lets build a quick model using TfidfVectorizer and Logistic Regression

In [None]:
# Features & Labels
x_data = df['cleaned_content']
y_data = df['sentiment']

In [None]:
#  Split Data
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.3,random_state=42,stratify = y_data)

In [None]:
# LogisticRegression Pipeline
pipeline_logreg = Pipeline(steps=[('tfidf',TfidfVectorizer(analyzer='word',strip_accents='unicode',ngram_range=(1, 2))),('lr',LogisticRegression(max_iter=200))])

The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters

In [None]:
# Train and Fit Data
pipeline_logreg.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents='unicode',
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_sca

In [None]:
# Check Accuracy and f1-score
y_pred=pipeline_logreg.predict(x_test)
print(pipeline_logreg.score(x_test,y_test))
print(classification_report(y_test,y_pred))

0.5666231221423906
              precision    recall  f1-score   support

     disgust       0.58      0.03      0.05       256
        fear       0.70      0.48      0.57      1260
         joy       0.54      0.84      0.66      3119
     neutral       0.54      0.08      0.13       345
     sadness       0.54      0.54      0.54      1839
       shame       0.60      0.45      0.51      1165
    surprise       0.61      0.36      0.45      1202

    accuracy                           0.57      9186
   macro avg       0.59      0.40      0.42      9186
weighted avg       0.58      0.57      0.54      9186



### Label Encoding on Sentiment data

Label Encoding is a popular encoding technique for handling categorical variables. In this technique, each label is assigned a unique integer based on alphabetical ordering.


+ 0 -> disgust
+ 1 -> fear
+ 2 -> joy
+ 3 -> neutral
+ 4 -> sad
+ 5 -> shame
+ 6 -> surprise


In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

In [None]:
df.head()

Unnamed: 0,sentiment,cleaned_content
0,2,sage act upgrade list tommorow
1,4,way homegirl baby funeral man hate funeral sho...
2,2,eye true hazel eyeand brilliant regular featur...
3,2,ugh babe hugggzzz babe naamaze nga ako babe de...
4,1,-PRON- be expect extremely important phonecall...


### TF-IDF

TF-IDF (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

The TfidfVectorizer will tokenize documents, learn the vocabulary and inverse document frequency weightings, and allow you to encode new documents

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=42, test_size=0.30, shuffle=True)

print(train.shape)
print(test.shape)

(21431, 2)
(9186, 2)


In [None]:
train_text = train['cleaned_content']
test_text = test['cleaned_content']

In [None]:
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(text_data, random_state=42, test_size=0.30, shuffle=True )
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['cleaned_content'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['cleaned_content'], axis=1)

### Handling the Imbalanced dataset using SMOTE (Synthetic Minority OverSampling Techniques)

SMOTE (synthetic minority oversampling technique) is one of the most commonly used oversampling methods to solve the imbalance problem. It aims to balance class distribution by randomly increasing minority class examples by replicating them. SMOTE synthesises new minority instances between existing minority instances.

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X, y = oversample.fit_resample(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [None]:
X.shape, y.shape

((50953, 116985), (50953,))

In [None]:
#1 Model - Logistic Regression
m1 = LogisticRegression()
m1.fit(X, y)
pred1 = m1.predict(x_test)
print(classification_report(y_test, pred1))

              precision    recall  f1-score   support

           0       0.41      0.16      0.23       259
           1       0.62      0.54      0.58      1244
           2       0.77      0.48      0.59      3116
           3       0.09      0.63      0.16       321
           4       0.53      0.49      0.51      1869
           5       0.46      0.57      0.51      1197
           6       0.61      0.34      0.44      1180

    accuracy                           0.48      9186
   macro avg       0.50      0.46      0.43      9186
weighted avg       0.61      0.48      0.52      9186



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_pred=classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[  39   24   17   60   55   50   14]
 [  10  635   72  222  165  107   33]
 [  18  188 1168  789  534  292  127]
 [   2    6   28  223   39   18    5]
 [   9  103  175  381  931  218   52]
 [   9   68   60  215  208  614   23]
 [   7   73  141  276  167   86  430]]
0.43979969518833006
              precision    recall  f1-score   support

           0       0.41      0.15      0.22       259
           1       0.58      0.51      0.54      1244
           2       0.70      0.37      0.49      3116
           3       0.10      0.69      0.18       321
           4       0.44      0.50      0.47      1869
           5       0.44      0.51      0.48      1197
           6       0.63      0.36      0.46      1180

    accuracy                           0.44      9186
   macro avg       0.47      0.44      0.41      9186
weighted avg       0.56      0.44      0.47      9186



### Models and their f1 score

+ Pipeline(TfidfVectorizer and Logistic Regression) -> 0.54
+ TfidfVectorizer and Logistic Regression with Oversampling -> 0.52
+ RandomForestClassifier with Oversampling ->0.47

Even after oversampling the data the performance is not that better than base model.

So I am dumping the base model into pickle.

In [None]:
# Save the model with pickle
model = open("text_model.pkl","wb")
joblib.dump(pipeline_logreg,model)
model.close()