In [5]:
#necessary imports
from pandas import read_csv
from pandas import Series
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from string import punctuation
from numpy import savetxt

In [8]:
# loading the data
train_data =read_csv(r'/content/0000000000002747_training_twitter_x_y_train.csv')
test_data  =read_csv(r'/content/0000000000002747_test_twitter_x_test.csv')

In [9]:
# tain and test data shape
print("Train data :",train_data.shape)
print("Test data :",test_data.shape)

Train data : (10980, 12)
Test data : (3660, 11)


In [10]:
# lets have a look into the train_data

train_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [11]:
#let us prepare our train and test data
train_X = train_data['text']
train_y = train_data['airline_sentiment']

test_X = test_data['text']

In [12]:
# have a look on the data
train_X[1]

'@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!'

In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# preparing the stop words

stop_words = set(stopwords.words('english'))
# taking the punctuation from the string module
punctuations = list(punctuation)
# adding the punctuation in the end of stop_words
stop_words.update(punctuations)

In [16]:
# function to return simple version of the pos_tag which can be used in lemmatization
def get_simple_pos_tag(tag):
    "take the pos_tag which and will return the simplified version of Wordnet format"
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# creating object for WordLemmatizer
lemma = WordNetLemmatizer()

def clean_tweet(tweet):
    # tokenizing the sentence to words
    words = word_tokenize(tweet)
    # output cleaned words
    cleaned_words = []
    
    # iterating over each word
    for word in words:
        # handling stop words
        if word.lower() not in stop_words:
            
            # getting the pos_tag of the word, passing in a list/array to get pos_tag for word rather character
            word_pos = pos_tag([word])
            
            #lemmetizing the word 
            lemmetized_word = lemma.lemmatize(word,get_simple_pos_tag(word_pos[0][1]))
            
            # appending the word to cleaned words
            cleaned_words.append(lemmetized_word.lower())
    
    # joining all the words to create a sentence
    cleaned_tweet = ' '.join(cleaned_words)
    
    # returning the cleaned_tweet
    return cleaned_tweet

In [18]:

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:

import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [22]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [23]:
# testing the cleaned_tweet function
li = 'Hi there! How are you buddy ?'
clean_tweet(li)

'hi buddy'

In [24]:
#lets clean our text feature
cleaned_X = [clean_tweet(tweet) for tweet in train_X]
cleaned_test_X = [clean_tweet(tweet) for tweet in test_X]

In [25]:
count_vec = CountVectorizer(max_features= 1000,ngram_range=(1,2))
X_trained_features = count_vec.fit_transform(cleaned_X)
X_test_features = count_vec.transform(cleaned_test_X)

In [26]:
tfidf_vec = TfidfVectorizer(max_features=1000,ngram_range=(1,2),max_df=0.9,min_df=0.1)
X_trained_features = count_vec.fit_transform(cleaned_X)
X_test_features = count_vec.transform(cleaned_test_X)

In [28]:
# lets see the top feature names
count_vec.get_feature_names_out()

array(['000', '10', '100', '11', '12', '15', '19', '1hr', '1k', '1st',
       '1st class', '20', '20 min', '20 minute', '200', '2015', '22',
       '23', '24', '25', '2nd', '30', '30 min', '30 minute', '3rd', '40',
       '40 min', '40 minute', '45', '45 min', '45 minute', '50', '60',
       '75', '800', '90', 'aa', 'able', 'able get', 'absolutely',
       'acceptable', 'access', 'account', 'actually', 'add', 'address',
       'afternoon', 'agent', 'ago', 'air', 'aircraft', 'airline',
       'airline ever', 'airlines', 'airport', 'airways', 'all', 'allow',
       'almost', 'almost hour', 'already', 'also', 'always', 'amaze',
       'american', 'americanair', 'americanair call',
       'americanair cancelled', 'americanair flight', 'americanair get',
       'americanair need', 'americanair please', 'americanair thank',
       'americanair thanks', 'americanair try', 'americanair usairways',
       'americanair ve', 'americanair yes', 'americanairlines', 'amp',
       'announce', 'anothe

In [29]:
#Model Creation 
# Lets try the SVC

svc = SVC()
svc.fit(X_trained_features,train_y)

In [30]:
svc_pred = svc.predict(X_test_features)

# score on train data, althout train score won't help much for us
svc.score(X_trained_features,train_y)

0.8835154826958106

In [31]:
# saving the predictions to CSV
Series(svc_pred).to_csv('svc_pred.csv',index=False,header=False)

In [32]:
# trying the Naive Bayes Classifier
nbc = MultinomialNB()

nbc.fit(X_trained_features,train_y)

In [33]:
nbc_pred = nbc.predict(X_test_features)

In [34]:
# score on train data, althout train score won't help much for us
nbc.score(X_trained_features,train_y)

0.7727686703096539

In [35]:
# saving the predictions to CSV
Series(nbc_pred).to_csv('naive-bayes_pred.csv',index=False,header=False)

In [36]:
# trying the Random Forest Classifier
rfc = RandomForestClassifier()

rfc.fit(X_trained_features,train_y)

rfc_pred = rfc.predict(X_test_features)

# score on train data, although train score won't help much for us
rfc.score(X_trained_features,train_y)

0.988615664845173

In [37]:
# saving the predictions to CSV
Series(rfc_pred).to_csv('random-forest_pred.csv',index=False,header=False)