In [10]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
import re

In [11]:
train_data = pd.read_csv("0000000000002747_training_twitter_x_y_train.csv")

In [12]:
train_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish üò§üê¥,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [13]:
train_data = train_data.drop(['airline_sentiment_gold','negativereason_gold','tweet_coord'],axis=1)

In [14]:
train_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,name,retweet_count,text,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,ColeyGirouard,0,"@SouthwestAir I am scheduled for the morning, ...",2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,WalterFaddoul,0,@SouthwestAir seeing your workers time in and ...,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,LocalKyle,0,@united Flew ORD to Miami and back and had gr...,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,amccarthy19,0,@SouthwestAir @dultch97 that's horse radish üò§üê¥,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,J_Okayy,0,@united so our flight into ORD was delayed bec...,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


data cleaning

In [15]:
#remove words which are starts with @ symbols
train_data['text'] = train_data['text'].map(lambda x:re.sub('@\w*','',str(x)))
#remove special characters except [a-zA-Z]
train_data['text'] = train_data['text'].map(lambda x:re.sub('[^a-zA-Z]',' ',str(x)))
#remove link starts with https
train_data['text'] = train_data['text'].map(lambda x:re.sub('http.*','',str(x)))

In [17]:
train_data['text'].head()

0     I am scheduled for the morning    days after ...
1     seeing your workers time in and time out goin...
2     Flew ORD to Miami and back and  had great cre...
3                               that s horse radish   
4     so our flight into ORD was delayed because of...
Name: text, dtype: object

In [18]:
train_data['text'] = train_data['text'].map(lambda x:str(x).lower())

In [19]:
corpus = []

In [20]:
none=train_data['text'].map(lambda x:corpus.append(' '.join([word for word in str(x).strip().split() if not word in set(stopwords.words('english'))])))

In [22]:
corpus

['scheduled morning days fact yes sure evening flight one cancelled flightled',
 'seeing workers time time going beyond love flying guys thank',
 'flew ord miami back great crew service legs thanks',
 'horse radish',
 'flight ord delayed air force one last flight sbn mins landed',
 'load us flying sardine knew pilots hours late flight incompetent beyond belief',
 'stock response delays frustrating poor cust serv amp told ppl wait amp come back',
 'nice hoping rack enough miles take trip seattle enjoy perfect latte city coffee',
 'frankly worse customer service ever problems happen deal defines company never united',
 'yeah haha never one expensive much fun destinationdragons',
 'mco gt dca flight almost full people screwed msy dca cancelled flightation cancelled flight swa mistake',
 'easiest way get ticket receipt get one check get one online thanks',
 'love changes lounge cheese veggies olives addition crackers snack mix',
 'receive bad customer service ended spending several hundred

In [23]:
X = pd.DataFrame(data=corpus,columns=['comment_text'])

In [24]:
X.head()

Unnamed: 0,comment_text
0,scheduled morning days fact yes sure evening f...
1,seeing workers time time going beyond love fly...
2,flew ord miami back great crew service legs th...
3,horse radish
4,flight ord delayed air force one last flight s...


In [63]:
target = []
for i in train_data["airline_sentiment"]:
    target.append(i)

In [26]:
y = train_data['airline_sentiment'].map({'neutral':1,'negative':-1,'positive':1})

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
vector = TfidfVectorizer(stop_words='english',sublinear_tf=True,strip_accents='unicode',analyzer='word',token_pattern=r'\w{2,}',ngram_range=(1,1),max_features=30000)

In [33]:
X_train_word_feature = vector.fit_transform(X['comment_text']).toarray()

In [38]:
X_train_word_feature

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
test_data = pd.read_csv("0000000000002747_test_twitter_x_test.csv")

In [41]:
test_data = test_data.drop(['airline_sentiment_gold','negativereason_gold','tweet_coord'],axis=1)

In [42]:
#remove words which are starts with @ symbols
test_data['text'] = test_data['text'].map(lambda x:re.sub('@\w*','',str(x)))
#remove special characters except [a-zA-Z]
test_data['text'] = test_data['text'].map(lambda x:re.sub('[^a-zA-Z]',' ',str(x)))
#remove link starts with https
test_data['text'] = test_data['text'].map(lambda x:re.sub('http.*','',str(x)))

In [43]:
test_data['text'].head()

0     In car gng to DFW  Pulled over  hr ago   very...
1     after all  the plane didn t land in identical...
2     can t believe how many paying customers you l...
3     I can legitimately say that I would have rath...
4           still no response from AA  great job guys 
Name: text, dtype: object

In [47]:
test_data['text'] = test_data['text'].map(lambda x:str(x).lower())

In [46]:
corpus1 = []

In [48]:
none=test_data['text'].map(lambda x:corpus1.append(' '.join([word for word in str(x).strip().split() if not word in set(stopwords.words('english'))])))

In [49]:
X_test = pd.DataFrame(data=corpus1,columns=['comment_text'])

In [54]:
X_test_word_feature = vector.transform(X_test['comment_text']).toarray()

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
classifier = LogisticRegression()

In [65]:
classifier.fit(X_train_word_feature, target)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [66]:
y_pred = classifier.predict(X_test_word_feature)

In [77]:
l = []
for i in range(len(y_pred)):
    l.append(y_pred[i])

In [79]:
data = pd.DataFrame(l)
##data

In [80]:
data.to_csv("t_p.csv", index = False, index_label = None, header=False)