In [1]:
import re
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

The dataset used here has 10980 tweets for training and 3660 tweets for testing

In [2]:
train = pd.read_csv('0000000000002747_training_twitter_x_y_train.csv')
test  = pd.read_csv('0000000000002747_test_twitter_x_test.csv')

In [3]:
train.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
test.head()

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)
3,569757651539660801,US Airways,,rossj987,,0,@USAirways I can legitimately say that I would...,,2015-02-22 23:16:24 -0800,"Washington, D.C.",Eastern Time (US & Canada)
4,569900705852608513,American,,tranpham18,,0,@AmericanAir still no response from AA. great ...,,2015-02-23 08:44:51 -0800,New York City,Eastern Time (US & Canada)


In [5]:
#Making a collection for stopwords and punctuations for cleaning the tweets

stop = stopwords.words('english')
punc = list(string.punctuation)
stop = stop+punc

I tried different techniques to clean the tweets with Stemming (PorterStemmer, SnowballStemmer) and Lemmatization but in the end used Lemmatization because it gave better accuracy

In [6]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [7]:
from nltk.stem.snowball import SnowballStemmer
ss = SnowballStemmer(language='english')

PyEnchant was used as it provides a set of Python language bindings for the Enchant spellchecking library and is used to remove certain words that had spelling mistakes or were not words of the english dictionary

In [8]:
import enchant
d = enchant.Dict("en_US")

Now we extract the required parts of the data from the given datasets that will be required to train the model

In [9]:
train_data = np.array(train['text'])
test_data  = np.array(test['text'])

y_train = np.array(train['airline_sentiment'])

Now we clean the tweets by using Lemmatization techniques and using regular expressions to remove links. We also filter out the stopwords and punctuations

In [10]:
### Cleaning the tweets

def partOfSpeech(pos):
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    

lemmatizer = WordNetLemmatizer()
def tweetModifier(tweet):

    tweet = tweet.lower()
    tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
    
    pos_tweet = pos_tag(word_tokenize(tweet)) 
    m_tweets = list(map(lambda x: (x[0], partOfSpeech(x[1])), pos_tweet))
    
    m_tweet = []
    
    for i, tag in m_tweets:
        if i not in stop and d.check(i) and i.isalpha():
            cleaned = lemmatizer.lemmatize(i, tag)
            #cleaned = ss.stem(i)
            if(len(cleaned)>1 and d.check(cleaned)):
                m_tweet.append(cleaned)

    return m_tweet #returns the modified tweet

In [11]:
train_data=[(tweetModifier(tweet)) for tweet in train_data]
test_data=[(tweetModifier(tweet)) for tweet in test_data]

In [12]:
train_data

[['schedule',
  'morning',
  'day',
  'fact',
  'yes',
  'sure',
  'evening',
  'flight',
  'one',
  'cancel'],
 ['see',
  'worker',
  'time',
  'time',
  'go',
  'beyond',
  'love',
  'fly',
  'guy',
  'thank'],
 ['united', 'fly', 'ord', 'back', 'great', 'crew', 'service', 'leg', 'thanks'],
 ['horse', 'radish'],
 ['unite',
  'flight',
  'ord',
  'delay',
  'air',
  'force',
  'one',
  'last',
  'flight',
  'land'],
 ['united',
  'load',
  'fly',
  'sardine',
  'know',
  'pilot',
  'hour',
  'late',
  'flight',
  'incompetent',
  'beyond',
  'belief'],
 ['stock',
  'response',
  'delay',
  'frustrating',
  'poor',
  'amp',
  'tell',
  'wait',
  'amp',
  'come',
  'back'],
 ['nice',
  'hop',
  'rack',
  'enough',
  'mile',
  'take',
  'trip',
  'enjoy',
  'perfect',
  'latte',
  'city',
  'coffee'],
 ['united',
  'frankly',
  'bad',
  'customer',
  'service',
  'ever',
  'problem',
  'happen',
  'deal',
  'define',
  'company',
  'never',
  'united'],
 ['yeah', 'never', 'one', 'expensiv

In [13]:
x_train = []
x_test = []
for i in range (0, len(train_data)):
    x_train.append(' '.join(train_data[i]))
for i in range (0, len(test_data)):
    x_test.append(' '.join(test_data[i]))

We use TfidVectorizer in place of Count vectorizer as it gives much better accuracy

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

count_vec = TfidfVectorizer(stop_words= stop, max_df=0.95, min_df=0.001) 
x_train = count_vec.fit_transform(x_train)

In [15]:
x_train

<10980x1011 sparse matrix of type '<class 'numpy.float64'>'
	with 72237 stored elements in Compressed Sparse Row format>

In [16]:
x_test = count_vec.transform(x_test)

In [17]:
x_test

<3660x1011 sparse matrix of type '<class 'numpy.float64'>'
	with 23845 stored elements in Compressed Sparse Row format>

Now we use the different pre built models to find the one with highest accuracy

RANDOM FOREST

In [18]:
### RANDOM FOREST

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(x_train, y_train)

y_test=clf.predict(x_test)

np.savetxt('RandomForest.csv',y_test,delimiter = ',',fmt='%s')

Descision Tree

In [19]:
### Descision Tree
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

y_test = clf.predict(x_test)

np.savetxt('DescisionTree.csv',y_test,delimiter = ',',fmt='%s')

Multinomial Naive Bayes

In [20]:
### Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha = 1)
clf.fit(x_train, y_train)

y_test = clf.predict(x_test)

np.savetxt('MNaiveBayes.csv',y_test,delimiter = ',',fmt='%s')

Support Vector Machine

In [21]:
### SVM
from sklearn.svm import SVC

clf = SVC()
clf.fit(x_train, y_train)

y_test = clf.predict(x_test)

np.savetxt('SVM.csv',y_test,delimiter = ',',fmt='%s')

Logistic Regression

In [22]:
### Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)

y_test = clf.predict(x_test)
np.savetxt('LogisticRegression.csv',y_test,delimiter = ',',fmt='%s')

The predicted values were tested on Coding Ninjas website and the highest accuracy was obtained by Logistic Regression and SVM models crossing 77%, Multinomial Naive Bayes and Random Forest both had accuracy around 75% and Descision Tree had accuracy nearing 70%