#                                                Objective
Build an API endpoint that accepts a piece of text and returns the sentiment associated with it.


In [50]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report
from sklearn.model_selection import train_test_split

# Load the airline sentiment dataset (path is relative to the working directory).
df = pd.read_csv('airline_sentiment_analysis.csv')
# Preview the first rows to see which columns were read.
df.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [51]:
# Keep every column except the leftover "Unnamed: 0" column that came from the CSV.
df = df.loc[:,df.columns!="Unnamed: 0"]
df.head()

Unnamed: 0,airline_sentiment,text
0,positive,@VirginAmerica plus you've added commercials t...
1,negative,@VirginAmerica it's really aggressive to blast...
2,negative,@VirginAmerica and it's a really big bad thing...
3,negative,@VirginAmerica seriously would pay $30 a fligh...
4,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [52]:
# Count missing values per column; both counts should be 0 before modelling.
df.isnull().sum() 

airline_sentiment    0
text                 0
dtype: int64

In [53]:
# Check for class imbalance: number of rows per sentiment label.
df['airline_sentiment'].value_counts()

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

# Text Pre-processing

In [54]:


# One-time setup: uncomment and run these lines if the NLTK data is not installed yet.
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

In [55]:
import nltk

import re

# Build the stop-word list. 'no', 'not' and 'but' are deliberately NOT treated
# as stop words so that negation/contrast survives preprocessing and can be
# captured in n-grams if possible.
try:
    _english_stop_words = nltk.corpus.stopwords.words('english')
except LookupError:
    # The corpus is not installed on a fresh environment; download it once and
    # retry so the notebook survives "Restart Kernel -> Run All".
    nltk.download('stopwords')
    _english_stop_words = nltk.corpus.stopwords.words('english')

# Filtering (rather than list.remove) cannot raise ValueError if a word is
# absent from the installed corpus version.
_words_to_keep = {'no', 'not', 'but'}
stop_words = [word for word in _english_stop_words if word not in _words_to_keep]



def simple_text_preprocessor(document, stopwords=None):
    """Normalise one document for the TF-IDF pipeline.

    Steps: lower-case the text, replace every non-letter character with a
    space, drop the literal ``nbsp`` (left over from ``&nbsp;`` entities),
    then remove stop words.

    Parameters
    ----------
    document : object
        The raw text; it is converted with ``str()`` first, so non-string
        values (e.g. ``None`` or NaN) are processed as their string form.
    stopwords : container of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces.
    """
    if stopwords is None:
        stopwords = stop_words

    # lower case
    text = str(document).lower()

    # remove unnecessary characters: anything that is not a letter becomes a space
    text = re.sub(r'[^a-zA-Z]', r' ', text)
    text = re.sub(r'nbsp', r'', text)

    # str.split() with no argument already collapses runs of whitespace, so no
    # separate space-squeezing pass is needed before stop-word removal.
    return ' '.join(word for word in text.split() if word not in stopwords)


# Element-wise version of the preprocessor for use on arrays of documents.
stp = np.vectorize(simple_text_preprocessor)

In [56]:
# Clean every document with the vectorised preprocessor.
# Note: this overwrites the raw 'text' column in place.
df['text'] = stp(df['text'].values)

df.head()

Unnamed: 0,airline_sentiment,text
0,positive,virginamerica plus added commercials experienc...
1,negative,virginamerica really aggressive blast obnoxiou...
2,negative,virginamerica really big bad thing
3,negative,virginamerica seriously would pay flight seats...
4,positive,virginamerica yes nearly every time fly vx ear...


In [57]:

# Features are the cleaned text; the target is the sentiment label.
X = df['text']
y = df['airline_sentiment']
X.head()

0    virginamerica plus added commercials experienc...
1    virginamerica really aggressive blast obnoxiou...
2                   virginamerica really big bad thing
3    virginamerica seriously would pay flight seats...
4    virginamerica yes nearly every time fly vx ear...
Name: text, dtype: object

In [58]:

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the classes are imbalanced and the split is not stratified —
# consider passing stratify=y so both sets keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [59]:

from collections import Counter
# Per-class row counts in the training and test labels.
Counter(y_train), Counter(y_test)

(Counter({'positive': 1800, 'negative': 6855}),
 Counter({'negative': 2323, 'positive': 563}))

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer



# testing different models

Model 1: Linear SVC

In [61]:

from sklearn.svm import LinearSVC

# Baseline 1: TF-IDF features fed into a linear SVM with default settings.
model = LinearSVC()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# Fit the vectoriser and the classifier together on the training split.
text_clf.fit(X_train, y_train)


Pipeline(steps=[('tfidf', TfidfVectorizer()), ('md', LinearSVC())])

In [62]:
# Predict on the held-out test set and print per-class precision/recall/F1.
predictions = text_clf.predict(X_test)
print(classification_report(y_test,predictions))
 

              precision    recall  f1-score   support

    negative       0.94      0.97      0.96      2323
    positive       0.86      0.75      0.80       563

    accuracy                           0.93      2886
   macro avg       0.90      0.86      0.88      2886
weighted avg       0.92      0.93      0.93      2886



Model 2: Logistic Regression

In [63]:
from sklearn.linear_model import LogisticRegression

# Baseline 2: TF-IDF features fed into logistic regression with default settings.
model = LogisticRegression()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])
text_clf.fit(X_train, y_train)




Pipeline(steps=[('tfidf', TfidfVectorizer()), ('md', LogisticRegression())])

In [64]:
# Predict on the held-out test set and print per-class precision/recall/F1.
predictions = text_clf.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

    negative       0.91      0.98      0.95      2323
    positive       0.90      0.60      0.72       563

    accuracy                           0.91      2886
   macro avg       0.90      0.79      0.83      2886
weighted avg       0.91      0.91      0.90      2886



Model 3: Decision Tree

In [65]:
from sklearn.tree import DecisionTreeClassifier

# Baseline 3: TF-IDF features fed into a single decision tree with default settings.
model = DecisionTreeClassifier()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.91      0.90      0.91      2323
    positive       0.62      0.63      0.62       563

    accuracy                           0.85      2886
   macro avg       0.76      0.77      0.77      2886
weighted avg       0.85      0.85      0.85      2886



Model 4: Bagging (decision trees)

In [66]:
from sklearn import tree
from sklearn.ensemble import BaggingClassifier

# Baseline 4: bagging of 100 decision trees on bootstrap samples.
# The base estimator is left at its default, which scikit-learn documents as a
# DecisionTreeClassifier. This gives the same ensemble as before without the
# `base_estimator` keyword, which was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4, so the cell runs on both old and new versions.
model = BaggingClassifier(n_estimators=100,
                          bootstrap=True, n_jobs=-1,
                          random_state=42)
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('md', model),])
text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)
print(classification_report(y_test, predictions))



              precision    recall  f1-score   support

    negative       0.91      0.95      0.93      2323
    positive       0.75      0.62      0.68       563

    accuracy                           0.89      2886
   macro avg       0.83      0.79      0.81      2886
weighted avg       0.88      0.89      0.88      2886



Model 5: Random Forest

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Baseline 5: random forest of 250 trees with a fixed seed.
model = RandomForestClassifier(n_estimators=250, random_state=42)
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

    negative       0.91      0.98      0.94      2323
    positive       0.89      0.61      0.72       563

    accuracy                           0.91      2886
   macro avg       0.90      0.79      0.83      2886
weighted avg       0.91      0.91      0.90      2886



Model 6: SVC

In [68]:
from sklearn.svm import SVC

# Baseline 6: TF-IDF features fed into SVC with default settings.
model = SVC()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

    negative       0.92      0.98      0.95      2323
    positive       0.91      0.64      0.75       563

    accuracy                           0.92      2886
   macro avg       0.91      0.81      0.85      2886
weighted avg       0.92      0.92      0.91      2886



Model 7: K-Nearest Neighbors

In [69]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 7: TF-IDF features fed into k-nearest neighbours with default settings.
model = KNeighborsClassifier()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.93      0.93      0.93      2323
    positive       0.69      0.69      0.69       563

    accuracy                           0.88      2886
   macro avg       0.81      0.81      0.81      2886
weighted avg       0.88      0.88      0.88      2886



Model 8: Gradient Boosting

In [70]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline 8: TF-IDF features fed into gradient boosting with default settings.
model = GradientBoostingClassifier()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.89      0.98      0.93      2323
    positive       0.85      0.51      0.64       563

    accuracy                           0.89      2886
   macro avg       0.87      0.74      0.79      2886
weighted avg       0.88      0.89      0.88      2886



Model 9: SGD Classifier

In [71]:
from sklearn.linear_model import SGDClassifier

# Baseline 9: TF-IDF features fed into an SGD-trained linear classifier (defaults).
model = SGDClassifier()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# Train on the training split, then score the held-out test split.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))



              precision    recall  f1-score   support

    negative       0.94      0.97      0.96      2323
    positive       0.87      0.74      0.80       563

    accuracy                           0.93      2886
   macro avg       0.90      0.86      0.88      2886
weighted avg       0.93      0.93      0.93      2886



# Linear SVC is the best of the models tested, with the highest accuracy and F1 score (SGDClassifier is essentially tied)

In [72]:
# Let's resample the training data to address the class imbalance

In [73]:

from sklearn.utils import resample

# Rejoin the training features and labels so whole rows are resampled together.
train_data = pd.concat([X_train, y_train], axis=1)

# Split the training rows by class: positive is the minority, negative the majority.
positive_sentiment_data = train_data.loc[train_data['airline_sentiment'] == 'positive']
negative_sentiment_data = train_data.loc[train_data['airline_sentiment'] == 'negative']

# Upsample the minority class: draw positive rows with replacement until there
# are as many of them as negative rows. Only the training split is resampled.
positive_upsampled = resample(
    positive_sentiment_data,
    replace=True,
    n_samples=negative_sentiment_data.shape[0],
    random_state=27,
)


In [74]:

# Combine the majority class with the upsampled minority class into one
# balanced training frame (original row indices are kept, so they may repeat).
upsampled = pd.concat([negative_sentiment_data, positive_upsampled])
upsampled.head()

Unnamed: 0,text,airline_sentiment
10146,americanair told could refund cost original re...,negative
9803,americanair everyone else outstanding,negative
1103,united ua denver austin still ground,negative
9482,americanair issue lack consideration announcem...,negative
4161,southwestair hold min trying rebook flight can...,negative


In [75]:
# Check the class counts: both classes should now have the same number of rows.
upsampled.airline_sentiment.value_counts()

negative    6855
positive    6855
Name: airline_sentiment, dtype: int64

In [76]:
# Features and labels of the balanced (upsampled) training set.
X_new = upsampled['text']
Y_new = upsampled['airline_sentiment']

In [77]:
# Retrain the Linear SVC pipeline, this time on the balanced training data.
model = LinearSVC()
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# Fit on the upsampled rows, then evaluate on the original (untouched) test split.
predictions = text_clf.fit(X_new, Y_new).predict(X_test)
print(classification_report(y_test, predictions))
 


              precision    recall  f1-score   support

    negative       0.95      0.94      0.95      2323
    positive       0.78      0.81      0.79       563

    accuracy                           0.92      2886
   macro avg       0.87      0.88      0.87      2886
weighted avg       0.92      0.92      0.92      2886



In [78]:
# Oversampling gave slightly worse overall results (the model appears to
# overfit), so go back to the original training data and instead try
# Linear SVC with different parameters: an L1 penalty in the primal form.
model = LinearSVC(penalty='l1', dual=False)
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('md', model),
])

# Train on the original training split, then score the held-out test split.
predictions = text_clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.94      0.96      0.95      2323
    positive       0.83      0.74      0.78       563

    accuracy                           0.92      2886
   macro avg       0.88      0.85      0.87      2886
weighted avg       0.92      0.92      0.92      2886



In [79]:
# Coming back to the best model: TF-IDF + Linear SVC with default parameters,
# trained on the original (non-resampled) training split.
# random_state is fixed because LinearSVC's dual solver shuffles the data
# randomly; seeding it makes the model that is saved below reproducible.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('L_svc', LinearSVC(random_state=42)),])

# Feed the training data through the pipeline, then evaluate on the test split.
text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    negative       0.94      0.97      0.96      2323
    positive       0.86      0.75      0.80       563

    accuracy                           0.93      2886
   macro avg       0.90      0.86      0.88      2886
weighted avg       0.92      0.93      0.93      2886



# Saving the trained model

In [80]:
import joblib
  
# File the fitted pipeline is written to (relative to the working directory).
joblib_file = "job_lib_nlp_model.pk"
# Persist the fitted TF-IDF + LinearSVC pipeline. It was trained on text that had
# already been cleaned by simple_text_preprocessor, so the same cleaning must be
# applied to any text before calling predict() on the loaded model.
joblib.dump(text_clf,joblib_file)

['job_lib_nlp_model.pk']