### NLP Sentiment Analysis Exercise

In [139]:
# import numpy 
import numpy as np 

# import pandas
import pandas as pd 

# import regex
import re

# import nltk
import nltk 

import warnings
warnings.filterwarnings('ignore')

In [151]:
# Load the airline tweets CSV into a DataFrame.
# Despite its name, data_source_url holds a relative file path, so the file
# is looked up relative to the kernel's current working directory.
data_source_url = "Tweets.csv"
airline_tweets = pd.read_csv(data_source_url)

In [152]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
airline_tweets.shape

(14640, 15)

In [153]:
# Keep only the label column and the raw tweet text.
# `.copy()` makes `df` an independent frame, so later modifications of `df`
# never write into a slice of `airline_tweets` (which is what triggers
# pandas' SettingWithCopyWarning and leaves the result ambiguous).
df = airline_tweets[['airline_sentiment', 'text']].copy()

In [154]:
# Tweet text as a NumPy array.
# NOTE(review): `features` is reassigned from df.text again after the cleaning
# cell and only that later value is passed to the vectorizer.
features = np.array(df.text)

In [184]:
# Target labels (the airline_sentiment column) as a NumPy array.
y = np.array(df.airline_sentiment)

**Task:** Clean the text data in the `'features'` array.

    - Remove all the special characters.
    - Remove all single characters.
    - Remove single characters from the start.
    - Substituting multiple spaces with single space.
    - Converting all text to lowercase.

In [160]:
import string
def remove_punct(text):
    """Return `text` with every character listed in string.punctuation dropped.

    Only the ASCII punctuation set is removed; all other characters
    (letters, digits, whitespace, non-ASCII symbols) are kept in order.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept_chars)

In [156]:
def tokenize(text):
    """Split `text` on runs of whitespace and return the list of tokens."""
    return text.split()

In [157]:
# Import the NLTK package and download the necessary data
import nltk
# Fetch the 'stopwords' corpus into the local nltk_data directory
# (reported as already up-to-date when it has been downloaded before).
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop-word list; remove_stopwords() reads this module-level name.
ENGstopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [158]:
# Define a function to remove all stopwords
def remove_stopwords(tokenized_text):
    """Return the tokens of `tokenized_text` that are not in ENGstopwords.

    Token order is preserved and the result is always a new list.
    """
    kept_tokens = []
    for token in tokenized_text:
        if token in ENGstopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [170]:
def list_to_str(tokenized_text):
    """Join a sequence of string tokens into one space-separated string."""
    return ' '.join(tokenized_text)

In [162]:
# Clean the tweet text in a single chain and bind the result to a new frame.
# `DataFrame.assign` returns a fresh DataFrame rather than writing column by
# column into `df`, which avoids chained assignment on a slice of
# `airline_tweets` (SettingWithCopyWarning). The helpers are passed directly
# instead of being wrapped in pass-through lambdas.
cleaned_text = (
    df['text']
    .apply(remove_punct)                            # remove special characters
    .apply(lambda tweet: tokenize(tweet.lower()))   # lowercase, then tokenize
    .apply(remove_stopwords)                        # remove stopwords
    .apply(list_to_str)                             # convert back to string
)
df = df.assign(text=cleaned_text)

In [171]:
# Rebuild the feature array from the cleaned text column; this is the value
# that is passed to the vectorizer.
features = np.array(df.text)

Import TfidfVectorizer from sklearn. <br>
**Task:** Instantiate TfidfVectorizer with the following parameters:

    - max_features = 2500
    - min_df = 7
    - max_df = 0.8
    - stop_words = stopwords.words('english')
    
    


In [174]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [175]:
# TF-IDF vectorizer configured as specified in the task above.
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('english'),
)

**Bonus:** How would you determine optimal parameters for TfidfVectorizer? Discuss with your peers and/or mentors. Write down your answer below.

In [None]:
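# I would use a grid search (e.g. sklearn's GridSearchCV) over a Pipeline of
# TfidfVectorizer + classifier, tuning max_features, min_df and max_df with
# cross-validation and picking the combination with the best validation score.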

**Task:** Transform features with vectorizer. 

In [181]:
# Fit the vectorizer on `features` and transform them into the TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on all rows before train_test_split
# is called below, so tweets that end up in the test set also shape the
# fitted vectorizer (data leakage). Consider fitting on the training split only.
X = vectorizer.fit_transform(features)

**Task:** Import train_test_split from sklearn and split the data.

In [182]:
from sklearn.model_selection import train_test_split

In [185]:
# Split features and labels with a fixed seed for reproducibility; the test
# size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

**Task:** Import any classifier of your choice from sklearn (e.g. Random Forest, LogReg, Naive Bayes).

## Naive Bayes

In [186]:
from sklearn import naive_bayes

In [187]:
# Multinomial Naive Bayes classifier with default hyperparameters.
clf = naive_bayes.MultinomialNB()

In [188]:
# Train on the training split.
clf.fit(X_train, y_train)

In [189]:
# Predicted labels for the held-out test split.
y_pred = clf.predict(X_test)

In [193]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [194]:
# Rows are true classes, columns are predicted classes. With the string
# labels used here sklearn orders classes by sorted label value:
# negative, neutral, positive.
confusion_matrix(y_test, y_pred)

array([[2285,   43,   12],
       [ 482,  231,   25],
       [ 262,   54,  266]])

In [195]:
# Fraction of test tweets whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.7601092896174864

## Random Forest

In [197]:
from sklearn.ensemble import RandomForestClassifier

In [198]:
# Random forest with default hyperparameters.
# A fixed random_state (the same seed used for train_test_split) makes the
# fitted forest, and therefore the reported metrics, reproducible across runs.
clf_forest = RandomForestClassifier(random_state=42)

In [199]:
# Train the forest on the training split.
clf_forest.fit(X_train, y_train)

In [201]:
# Random-forest predictions for the held-out test split.
y_pred2 = clf_forest.predict(X_test)

In [203]:
# Confusion matrix (rows = true class, columns = predicted class) followed by
# overall accuracy for the random forest.
print(confusion_matrix(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2))

[[2188  100   52]
 [ 386  297   55]
 [ 180   79  323]]
0.7672131147540984


## Logistic Regression

In [205]:
from sklearn.linear_model import LogisticRegression

In [206]:
# Logistic regression with default hyperparameters.
# NOTE(review): warnings are globally suppressed at the top of the notebook,
# so a convergence warning from the default solver settings would not be
# shown. Confirm the fit converges, or raise max_iter if it does not.
log_clf = LogisticRegression()

In [207]:
# Train on the training split.
log_clf.fit(X_train, y_train)

In [210]:
# Logistic-regression predictions for the held-out test split.
y_pred3 = log_clf.predict(X_test)

In [211]:
# Confusion matrix (rows = true class, columns = predicted class) followed by
# overall accuracy for logistic regression.
print(confusion_matrix(y_test, y_pred3))
print(accuracy_score(y_test, y_pred3))

[[2208   93   39]
 [ 316  375   47]
 [ 147   76  359]]
0.8038251366120218


##  XGBoost

In [218]:
# NOTE(review): `new_y` is not referenced anywhere else in the visible
# notebook; this looks like leftover scratch work and can probably be removed.
new_y = pd.DataFrame()

In [232]:
# Encode the string sentiment labels as integer class codes for the XGBoost
# section. This rebinds `y`, which previously held the string labels.
#
# `Series.map` is used instead of `Series.replace`: replacing strings with
# ints relies on pandas' implicit object->int downcasting, which is deprecated
# (the FutureWarning is hidden here by the global warnings filter). With `map`,
# a label missing from the dict becomes NaN instead of silently passing through.
#
# The codes are not in alphabetical label order, so matrices built from them
# are ordered negative, positive, neutral.
sentiment_codes = {
    'negative': 0,
    'positive': 1,
    'neutral': 2,
}
y = df['airline_sentiment'].map(sentiment_codes)

In [233]:
# Inspect the integer-encoded labels.
y

0        2
1        1
2        2
3        0
4        0
        ..
14635    1
14636    0
14637    2
14638    0
14639    2
Name: airline_sentiment, Length: 14640, dtype: int64

In [234]:
# Re-split using the integer-encoded `y`. This rebinds X_train/X_test/
# y_train/y_test; the same seed as the earlier split is used.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [235]:
from xgboost import XGBClassifier

In [236]:
# XGBoost classifier with default hyperparameters.
xgb = XGBClassifier()

In [237]:
# Train on the training split (integer-encoded labels).
xgb.fit(X_train, y_train)

In [238]:
# XGBoost predictions for the held-out test split.
y_pred4 = xgb.predict(X_test)

In [239]:
# Confusion matrix (rows = true class, columns = predicted class) followed by
# overall accuracy for XGBoost.
# Labels are the integer codes assigned above (0=negative, 1=positive,
# 2=neutral), so rows/columns are ordered negative, positive, neutral. That
# differs from the negative/neutral/positive order of the earlier matrices
# built from string labels.
print(confusion_matrix(y_test, y_pred4))
print(accuracy_score(y_test, y_pred4))

[[2166   58  116]
 [ 159  355   68]
 [ 378   59  301]]
0.7710382513661203
