In [2]:
import numpy as np 
import pandas as pd
import re 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [22]:
# Input files are resolved relative to the notebook's working directory.
train_csv_path = "training_twitter_x_y_train.csv"
test_csv_path = "test_twitter_x_test.csv"

train = pd.read_csv(train_csv_path)  # original author's note: 10980 rows, 12 cols
test = pd.read_csv(test_csv_path)


In [7]:
# Sentiment labels of the training tweets as a standalone NumPy array.
# NOTE(review): the model cell fits on train['airline_sentiment'] directly,
# so this array is not referenced by the cells visible in this notebook.
y_train = np.array(train.loc[:, "airline_sentiment"])

In [23]:
# Columns that the following visible cells never read; the same set is
# removed, in place, from both the training and the test frame.
unused_columns = [
    'airline_sentiment_gold', 'name', 'tweet_id', 'retweet_count',
    'tweet_created', 'user_timezone', 'tweet_coord', 'tweet_location',
]
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [24]:
# Display the training rows that carry a value in 'negativereason_gold'.
train.loc[train["negativereason_gold"].notna()]

Unnamed: 0,airline_sentiment,airline,negativereason_gold,text
210,negative,American,Customer Service Issue,@AmericanAir @SouljaCoy what is AA going to do...
537,negative,United,Late Flight\nFlight Attendant Complaints,"@united I'm aware of the flight details, thank..."
579,negative,United,Can't Tell,@united couldn't have possibly messed up our t...
954,negative,United,Cancelled Flight\nCustomer Service Issue,@united rebooked 24 hours after original fligh...
1149,negative,Delta,Customer Service Issue,@DeltaAssist now at 57 minutes waiting on Silv...
1188,negative,American,Customer Service Issue,@AmericanAir FYI...call stilling getting dropp...
1473,negative,Southwest,Customer Service Issue,@SouthwestAir please send me a number to call ...
1582,negative,Southwest,Customer Service Issue,@SouthwestAir I've been on hold for over an ho...
1726,negative,American,Late Flight,"@AmericanAir Okay, I think 1565 has waited lon..."
2410,negative,American,Customer Service Issue,@AmericanAir no response to DM or email yet. ...


In [20]:
# Stop-word list used by the tweet-cleaning cells: NLTK's English stop words,
# every single punctuation character, and a few airline-domain terms.
stops = stopwords.words('english')
stops += list(punctuation)
# Domain terms must be lowercase: the cleaning loops test `word.lower()`
# against this list, so the former 'AA' entry could never match any token.
stops += ['flight', 'airline', 'flights', 'aa']


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [30]:
# Tweet shorthand -> the full word it is replaced with during cleaning.
abbreviations = {'ppl': 'people', 'cust': 'customer', 'serv': 'service', 'mins': 'minutes',
                 'hrs': 'hours', 'svc': 'service', 'u': 'you', 'pls': 'please'}

# Index labels of the rows that carry a gold negative reason (names kept from
# the original cell in case another cell looks them up).
train_index = train[train.negativereason_gold.notna()].index
test_index = test[test.negativereason_gold.notna()].index

# Raw-string patterns: '\s' and '\.' inside a normal string literal are invalid
# escape sequences and emit a warning on recent Python versions.
_LINK_PATTERN = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_MENTION_PATTERN = re.compile(r'@[^\s]+')
_WHITESPACE_PATTERN = re.compile(r'[\s]+')
_HASHTAG_PATTERN = re.compile(r'#([^\s]+)')


def clean_tweet(text, airline, negative_reason, stop_words, abbreviation_map):
    """Normalise one tweet and append its airline (and gold reason, if any).

    Steps, in order: remove links, remove @usernames, collapse whitespace,
    turn ``#word`` into ``word``, trim surrounding quote characters, lowercase
    each whitespace-separated token, drop stop words and expand abbreviations.

    Parameters
    ----------
    text : str
        Raw tweet text.
    airline : object
        Value of the row's ``airline`` column; appended after the tokens.
    negative_reason : object
        Value of the row's ``negativereason_gold`` column; appended only when
        it is not missing.
    stop_words : collection of str
        Lowercase tokens to drop.
    abbreviation_map : dict
        Lowercase shorthand -> replacement word.

    Returns
    -------
    str
        The cleaned text, in the same ``" tokens airline[ reason]"`` layout the
        original loop produced (leading space included).
    """
    tweet = _LINK_PATTERN.sub('', text)
    tweet = _MENTION_PATTERN.sub('', tweet)
    tweet = _WHITESPACE_PATTERN.sub(' ', tweet)
    tweet = _HASHTAG_PATTERN.sub(r'\1', tweet)
    tweet = tweet.strip('\'"')

    words = []
    for word in tweet.split():
        lowered = word.lower()
        if lowered in stop_words:
            continue
        # Look the abbreviation up on the lowercased token so that 'Pls' or 'U'
        # are expanded as well; the keys of the map are all lowercase.
        words.append(abbreviation_map.get(lowered, lowered))

    tweet = " %s %s" % (" ".join(words), airline)
    if pd.notna(negative_reason):
        tweet = " %s %s" % (tweet, negative_reason)
    return tweet


def preprocess_tweets(frame, stop_words, abbreviation_map):
    """Clean ``frame['text']`` in place and remove ``negativereason_gold``.

    The cleaned values are assigned back to the column explicitly. Writing to
    the ``row`` objects yielded by ``DataFrame.iterrows()`` is not reliable:
    pandas documents that they may be copies, in which case the cleaned text
    never reaches the frame.
    """
    frame['text'] = [
        clean_tweet(text, airline, reason, stop_words, abbreviation_map)
        for text, airline, reason in zip(
            frame['text'], frame['airline'], frame['negativereason_gold']
        )
    ]
    del frame['negativereason_gold']


# Tokens are lowercased before the stop-word test, so the lookup set is
# lowercased too; a set also makes each membership test constant-time.
stop_word_set = {stop_word.lower() for stop_word in stops}

preprocess_tweets(train, stop_word_set, abbreviations)
preprocess_tweets(test, stop_word_set, abbreviations)

In [31]:
def deEmojify(inputString):
    """Return ``inputString`` with every non-ASCII character removed.

    Emoji disappear because they are non-ASCII; so does any other character
    outside the ASCII range (accented letters, typographic quotes, ...).
    """
    ascii_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def hasNumbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit.

    Digit-ness is decided by ``str.isdigit`` on each character; an empty
    string yields False.
    """
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False

def remove_numeric_tokens(text):
    """Return ``text`` without the whitespace-separated tokens that contain a digit.

    The surviving tokens are re-joined with single spaces, so leading, trailing
    and repeated whitespace is collapsed as a side effect.
    """
    return " ".join(token for token in text.split() if not hasNumbers(token))


# Assign the transformed column back explicitly. Writing to the `row` objects
# yielded by DataFrame.iterrows() is not reliable: pandas documents that they
# may be copies, in which case the changes never reach the frame.
for frame in (train, test):
    ascii_text = frame['text'].map(deEmojify)            # drop non-ASCII characters
    frame['text'] = ascii_text.map(remove_numeric_tokens)  # drop tokens with digits

In [33]:
# TF-IDF features over single words. The vocabulary is fitted on the training
# tweets only and then applied unchanged to the test tweets.
v = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_df=0.8,
    max_features=3150,
)
train_features = v.fit_transform(train['text'])
test_features = v.transform(test['text'])

# Support-vector classifier with scikit-learn's default settings, trained on
# the sentiment labels and used to label the test tweets.
clf = SVC()
clf.fit(train_features, train.airline_sentiment)
pred = clf.predict(test_features)

In [44]:
# One predicted label per line: no header row and no index column.
s = pd.DataFrame(data=pred)
s.to_csv('prediction.csv', index=False, header=False)