In [1]:
import numpy as np
import pandas as pd

import itertools as itertools

from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split,validation_curve
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem import PorterStemmer
from nltk import FreqDist
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

import re



# Functions

In [2]:
def stemWord(sen):
    tokens = word_tokenize(sen)
    result = ''
    ps = PorterStemmer()
    for wordToken in tokens:
        result = result + ' ' + ps.stem(wordToken)
    return result

def checkEnglish(sen, english_vocab):
    totalCount = len(sen.split())
    counter = 0
    for word in sen:
        if word in english_vocab:
            counter = counter + 1
    
    return((counter/totalCount) >= 0.15)

# Base Implementation: No Filtrations

In [24]:
# Read the tweets
df = pd.read_csv('Tweets.csv')

# Remove URLs in tweet texts
df['text'] = df.apply(lambda row: re.sub(r"http\S+", "", row.text), axis=1)

# Stem the words
df['text'] = df.apply(lambda row: stemWord(row.text), axis=1)

# Store the Target variable
Y = df['airline_sentiment']

# Filter the reviewText column in the dataset
X = df['text']

# Split the dataset into training and testing sets with a ration of 0.8:0.2
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)

# Define tfidf vectorizer with maximum of 40000 features
tfidf_vectorizer = TfidfVectorizer(max_features = 40000,sublinear_tf=False, analyzer='word', stop_words='english',strip_accents='ascii')
X_train_transformed = tfidf_vectorizer.fit_transform(X_train)
X_test_transformed = tfidf_vectorizer.transform(X_test)

clfM1 = MultinomialNB()

clfM1.fit(X_train_transformed, Y_train)
Y_pred = clfM1.predict(X_test_transformed)
print('Multinomial Bayes Before Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

clfK1 = KNeighborsClassifier()

clfK1.fit(X_train_transformed, Y_train)
Y_pred = clfK1.predict(X_test_transformed)
print('K-Neighbours Before Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

clfR1 = RandomForestClassifier()

clfR1.fit(X_train_transformed, Y_train)
Y_pred = clfR1.predict(X_test_transformed)
print('Random Forest Before Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

Multinomial Bayes Before Filtration Report:
               precision    recall  f1-score   support

    negative       0.67      0.99      0.80      1826
     neutral       0.73      0.14      0.23       611
    positive       0.88      0.17      0.28       491

   micro avg       0.68      0.68      0.68      2928
   macro avg       0.76      0.43      0.44      2928
weighted avg       0.72      0.68      0.59      2928

K-Neighbours Before Filtration Report:
               precision    recall  f1-score   support

    negative       0.78      0.85      0.81      1826
     neutral       0.48      0.42      0.45       611
    positive       0.67      0.56      0.61       491

   micro avg       0.71      0.71      0.71      2928
   macro avg       0.64      0.61      0.62      2928
weighted avg       0.70      0.71      0.70      2928





Random Forest Before Filtration Report:
               precision    recall  f1-score   support

    negative       0.77      0.92      0.84      1826
     neutral       0.56      0.37      0.45       611
    positive       0.73      0.50      0.59       491

   micro avg       0.74      0.74      0.74      2928
   macro avg       0.69      0.60      0.63      2928
weighted avg       0.72      0.74      0.72      2928



# Bonus Number 4: Filtrations on the Airline Tweets

In [25]:
# Read the tweets
df = pd.read_csv('Tweets.csv')

# Remove URLs in tweet texts
df['text'] = df.apply(lambda row: re.sub(r"http\S+", "", row.text), axis=1)

# Choose tweets with length less greater than or equal to 20 characters
df['length'] = df.apply(lambda row: len(row.text), axis=1)
df = df[df.length>20]

# Check english words
english_vocab = [w.lower() for w in nltk.corpus.words.words()]
english_freq = FreqDist(english_vocab)
english_freq = english_freq.most_common(2000)
commonWords, _ = zip(*english_freq)
commonWords = commonWords[:2000]
df['english'] = df.apply(lambda row: checkEnglish(row.text, commonWords), axis=1)
df = df[df.english == True]

# Remove retweets
df = df[~df.text.str.contains('RT')]

# Choose selected columns 
selectedColummns = ['airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'text']
df = df[selectedColummns]

# Fill NAs with given values
values = {'negativereason': '', 'negativereason_confidence': 0}
df = df.fillna(value=values)

# Choose tweets with confidence more than 50%
df = df[ ((df['airline_sentiment_confidence'] >= 0.5) | (df['negativereason_confidence'] >= 0.5)) ]

# Stem the words
df['text'] = df.apply(lambda row: stemWord(row.text), axis=1)
# Form tfidf vecctorizer from the tweet texts
text = df['text']
fullTfidf_vectorizer = TfidfVectorizer(max_features = 40000,sublinear_tf=False, analyzer='word', stop_words='english',strip_accents='ascii')
Xtransformed = fullTfidf_vectorizer.fit_transform(text)

# Create dataframe from tfidf vectorizer
tfidf = pd.DataFrame(Xtransformed.toarray(), columns=fullTfidf_vectorizer.get_feature_names())

# Compute cosine similarity for the tweets
cosSimilarity = (cosine_similarity(tfidf)).tolist()

# Set empty set and indices used for looping over all tweets
indices = set()
rowNumber = 0
itemNumber = 0

# Loop over all tweets and their cosine similarity with other tweets
for row in cosSimilarity:
    itemNumber = 0
    # Loop over all cosine similarities of the other tweets with the tweet being inspected in the outer loop
    for other in row:
        # Check that we are not comparing the tweet's cosine similarity with itself
        if (rowNumber != itemNumber):
            if (other >= 0.9):
                # Add the index of the other tweets if the similarity exceeds 90%
                indices.add(itemNumber)
        itemNumber = itemNumber + 1
    rowNumber = rowNumber + 1

# Remove the tweets with the indices present in the indices set
df = df.drop(df.index[list(indices)])
# Reset the index
df = df.reset_index(drop=True)

print('Unique counts for each rating:\n',df.airline_sentiment.value_counts())
# Set the number of rows to select from each sentiment class the minimum number of tweets present in a sentiment class
numberOfRows = (df.airline_sentiment.value_counts()).min()
# Select a number of rows from each sentiment class
df = pd.concat([(df.loc[df['airline_sentiment'] == 'neutral']).head(numberOfRows), 
                (df.loc[df['airline_sentiment'] == 'negative']).head(numberOfRows),
                (df.loc[df['airline_sentiment'] == 'positive']).head(numberOfRows)
                ])
print('\nShape after equal distribution: ',df.shape)

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Store the Target variable
Y = df['airline_sentiment']

# Filter the reviewText column in the dataset
X = df['text']

# Split the dataset into training and testing sets with a ration of 0.8:0.2
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)

# Define tfidf vectorizer with maximum of 40000 features
tfidf_vectorizer = TfidfVectorizer(max_features = 40000,sublinear_tf=False, analyzer='word', stop_words='english',strip_accents='ascii')
X_train_transformed = tfidf_vectorizer.fit_transform(X_train)
X_test_transformed = tfidf_vectorizer.transform(X_test)

clfM2 = MultinomialNB()

clfM2.fit(X_train_transformed, Y_train)
Y_pred = clfM2.predict(X_test_transformed)
print('Multinomial Bayes After Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

clfK2 = KNeighborsClassifier()

clfK2.fit(X_train_transformed, Y_train)
Y_pred = clfK2.predict(X_test_transformed)
print('K-Neighbours After Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

clfR2 = RandomForestClassifier()

clfR2.fit(X_train_transformed, Y_train)
Y_pred = clfR2.predict(X_test_transformed)
print('Random Forest After Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

Unique counts for each rating:
 negative    8810
neutral     2603
positive    1998
Name: airline_sentiment, dtype: int64

Shape after equal distribution:  (5994, 5)
Multinomial Bayes After Filtration Report:
               precision    recall  f1-score   support

    negative       0.74      0.88      0.81       385
     neutral       0.75      0.58      0.66       395
    positive       0.77      0.80      0.79       419

   micro avg       0.76      0.76      0.76      1199
   macro avg       0.76      0.76      0.75      1199
weighted avg       0.76      0.76      0.75      1199

K-Neighbours After Filtration Report:
               precision    recall  f1-score   support

    negative       0.67      0.71      0.69       385
     neutral       0.60      0.59      0.60       395
    positive       0.72      0.69      0.71       419

   micro avg       0.66      0.66      0.66      1199
   macro avg       0.66      0.66      0.66      1199
weighted avg       0.66      0.66      0.66  



Random Forest After Filtration Report:
               precision    recall  f1-score   support

    negative       0.71      0.88      0.78       385
     neutral       0.64      0.58      0.61       395
    positive       0.75      0.65      0.70       419

   micro avg       0.70      0.70      0.70      1199
   macro avg       0.70      0.71      0.70      1199
weighted avg       0.70      0.70      0.70      1199



# Bonus Number 3: Sentiment140 Dataset + Filtrations

In [4]:
# Read the tweets
df = pd.read_csv('training.1600000.processed.noemoticon.csv', engine = 'python', 
                 names=['polarity','id','date','query','user','text'])

# Convert polarity feature to string rather than int
df['polarity'] = df['polarity'].astype('str')

len(df.text.unique())

1581466

In [17]:
type(df["text"])

str

In [26]:
df.iloc[54].text

'I need a hug '

In [28]:
text = df["text"]
display(df[text.isin(text[text.duplicated()])])
df.loc[df.text == (df.iloc[54].text)]

Unnamed: 0,polarity,id,date,query,user,text
54,0,1467821455,Mon Apr 06 22:22:32 PDT 2009,NO_QUERY,CiaraRenee,I need a hug
128,0,1467841832,Mon Apr 06 22:27:55 PDT 2009,NO_QUERY,bgoers,I'm so cold
213,0,1467863684,Mon Apr 06 22:33:35 PDT 2009,NO_QUERY,DjGundam,Awwh babs... you look so sad underneith that s...
230,0,1467872175,Mon Apr 06 22:35:50 PDT 2009,NO_QUERY,edsed,I still can't find my keys.
238,0,1467872759,Mon Apr 06 22:35:59 PDT 2009,NO_QUERY,Augustina22CA,"im lonely keep me company! 22 female, california"
275,0,1467880442,Mon Apr 06 22:38:04 PDT 2009,NO_QUERY,iCalvin,Haven't tweeted nearly all day Posted my webs...
357,0,1467900545,Mon Apr 06 22:43:31 PDT 2009,NO_QUERY,brookes4402,homework....
364,0,1467901500,Mon Apr 06 22:43:49 PDT 2009,NO_QUERY,thegeach,feeling down
398,0,1467912842,Mon Apr 06 22:46:53 PDT 2009,NO_QUERY,KimberlyKane,@danadearmond
465,0,1467930017,Mon Apr 06 22:51:48 PDT 2009,NO_QUERY,Glycel,stuck at home


Unnamed: 0,polarity,id,date,query,user,text
54,0,1467821455,Mon Apr 06 22:22:32 PDT 2009,NO_QUERY,CiaraRenee,I need a hug
35285,0,1565156502,Mon Apr 20 06:03:00 PDT 2009,NO_QUERY,1Song,I need a hug
84168,0,1753529442,Sun May 10 02:05:57 PDT 2009,NO_QUERY,liedra,I need a hug
92500,0,1760069887,Sun May 10 20:35:44 PDT 2009,NO_QUERY,DarianFroseth,I need a hug
103005,0,1795358529,Thu May 14 07:37:55 PDT 2009,NO_QUERY,AlexandraTheSpy,I need a hug
109872,0,1824686179,Sun May 17 02:28:47 PDT 2009,NO_QUERY,krystalcyo,I need a hug
141634,0,1881307284,Fri May 22 04:16:17 PDT 2009,NO_QUERY,Suff0cat,I need a hug
230755,0,1978810915,Sun May 31 00:32:57 PDT 2009,NO_QUERY,melombardo,I need a hug
242916,0,1981423369,Sun May 31 09:07:20 PDT 2009,NO_QUERY,xshmodie,I need a hug
279499,0,1991805662,Mon Jun 01 07:41:45 PDT 2009,NO_QUERY,Jaydaboo18,I need a hug


Unnamed: 0,polarity,id,date,query,user,text


In [4]:
# Read the tweets
df = pd.read_csv('training.1600000.processed.noemoticon.csv', engine = 'python', 
                 names=['polarity','id','date','query','user','text'])

# Convert polarity feature to string rather than int
df['polarity'] = df['polarity'].astype('str')

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Choose 1st 20000 tweets to prevent memory errors
df = df.head(20000)

# Remove URLs in tweet texts
df['text'] = df.apply(lambda row: re.sub(r"http\S+", "", row.text), axis=1)

# Choose tweets with length less greater than or equal to 20 characters
df['length'] = df.apply(lambda row: len(row.text), axis=1)
df = df[df.length>20]

# Check english words
english_vocab = [w.lower() for w in nltk.corpus.words.words()]
english_freq = FreqDist(english_vocab)
english_freq = english_freq.most_common(2000)
commonWords, _ = zip(*english_freq)
commonWords = commonWords[:2000]
df['english'] = df.apply(lambda row: checkEnglish(row.text, commonWords), axis=1)
df = df[df.english == True]

# Remove retweets
df = df[~df.text.str.contains('RT')]

# Stem the words
df['text'] = df.apply(lambda row: stemWord(row.text), axis=1)
# Form tfidf vecctorizer from the tweet texts
text = df['text']
fullTfidf_vectorizer = TfidfVectorizer(max_features = 40000,sublinear_tf=False, analyzer='word', stop_words='english',strip_accents='ascii')
Xtransformed = fullTfidf_vectorizer.fit_transform(text)

# Create dataframe from tfidf vectorizer
tfidf = pd.DataFrame(Xtransformed.toarray(), columns=fullTfidf_vectorizer.get_feature_names())

# Compute cosine similarity for the tweets
cosSimilarity = (cosine_similarity(tfidf)).tolist()

# Set empty set and indices used for looping over all tweets
indices = set()
rowNumber = 0
itemNumber = 0

# Loop over all tweets and their cosine similarity with other tweets
for row in cosSimilarity:
    itemNumber = 0
    # Loop over all cosine similarities of the other tweets with the tweet being inspected in the outer loop
    for other in row:
        # Check that we are not comparing the tweet's cosine similarity with itself
        if (rowNumber != itemNumber):
            if (other >= 0.9):
                # Add the index of the other tweets if the similarity exceeds 90%
                indices.add(itemNumber)
        itemNumber = itemNumber + 1
    rowNumber = rowNumber + 1

# Remove the tweets with the indices present in the indices set
df = df.drop(df.index[list(indices)])
# Reset the index
df = df.reset_index(drop=True)

print('Unique counts for each rating:\n',df.polarity.value_counts())
# Set the number of rows to select from each sentiment class the minimum number of tweets present in a sentiment class
numberOfRows = (df.polarity.value_counts()).min()
# Select a number of rows from each sentiment class
df = pd.concat([(df.loc[df['polarity'] == '2']).head(numberOfRows), 
                (df.loc[df['polarity'] == '0']).head(numberOfRows),
                (df.loc[df['polarity'] == '4']).head(numberOfRows)
                ])
print('\nShape after equal distribution: ',df.shape)

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Store the Target variable
Y = df['polarity']

# Filter the reviewText column in the dataset
X = df['text']

# Split the dataset into training and testing sets with a ration of 0.8:0.2
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=1)

# Define tfidf vectorizer with maximum of 40000 features
tfidf_vectorizer = TfidfVectorizer(max_features = 40000,sublinear_tf=False, analyzer='word', stop_words='english',strip_accents='ascii')
X_train_transformed = tfidf_vectorizer.fit_transform(X_train)
X_test_transformed = tfidf_vectorizer.transform(X_test)

clfM2 = MultinomialNB()

clfM2.fit(X_train_transformed, Y_train)
Y_pred = clfM2.predict(X_test_transformed)
print('Multinomial Bayes After Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

clfK2 = KNeighborsClassifier()

clfK2.fit(X_train_transformed, Y_train)
Y_pred = clfK2.predict(X_test_transformed)
print('K-Neighbours After Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

clfR2 = RandomForestClassifier()

clfR2.fit(X_train_transformed, Y_train)
Y_pred = clfR2.predict(X_test_transformed)
print('Random Forest After Filtration Report:\n', metrics.classification_report(Y_test, Y_pred))

Unique counts for each rating:
 4    9622
0    9417
Name: polarity, dtype: int64

Shape after equal distribution:  (18834, 8)
Multinomial Bayes After Filtration Report:
               precision    recall  f1-score   support

           0       0.69      0.78      0.74      1871
           4       0.75      0.66      0.70      1896

   micro avg       0.72      0.72      0.72      3767
   macro avg       0.72      0.72      0.72      3767
weighted avg       0.72      0.72      0.72      3767

K-Neighbours After Filtration Report:
               precision    recall  f1-score   support

           0       0.58      0.82      0.68      1871
           4       0.70      0.42      0.53      1896

   micro avg       0.62      0.62      0.62      3767
   macro avg       0.64      0.62      0.61      3767
weighted avg       0.64      0.62      0.60      3767





Random Forest After Filtration Report:
               precision    recall  f1-score   support

           0       0.67      0.73      0.70      1871
           4       0.71      0.65      0.68      1896

   micro avg       0.69      0.69      0.69      3767
   macro avg       0.69      0.69      0.69      3767
weighted avg       0.69      0.69      0.69      3767



In [5]:
clfM2

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)