In [20]:
import nltk
import pandas as pd
import nltk
import re
from string import punctuation
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect
from nltk.stem import PorterStemmer


# Reading the dataset

In [21]:
df = pd.read_csv('Tweets.csv', encoding="utf-8")
Y = df['airline_sentiment']
tweets = df['text']

# Filtering

In [16]:
for idx, tweet in enumerate(tweets):
    if(re.search(r'\b' + 'RT' + r'\b', tweet) or len(tweet)<20 or detect(tweet)!='en'):
        del tweets[idx]
        del Y[idx]

# Preprocessing

In [22]:
# URL stripping
tweets = [re.sub(r'https?:\/\/.*[\r\n]*', '', tweet).strip() for tweet in tweets]

# tweets = [''.join(character for character in text if ord(character)< 128)for text in tweets]

# tokenization
tweets_tokens = []
for tweet in tweets:
    tweets_tokens.append(word_tokenize(tweet))
    
# # punctuation
# punctuation += "“’…”"
# tweets_tokens = [''.join([char for char in text.decode('utf-8').encode('ascii') if char not in punctuation]) for text in tweets]

# case folding
tweets_tokens = [[word.lower() for word in token] for token in tweets_tokens]

#stemming
ps = PorterStemmer()
tweets_tokens = [[ps.stem(word) for word in token] for token in tweets_tokens]

# remove stopwords
stop_words = set(stopwords.words('english')) 
tweets_tokens = [[word for word in token if not word in stop_words] for token in tweets_tokens]

X = [' '.join(words) for words in tweets_tokens]

# Splitting

In [23]:
# Split the dataset into 80% training.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)


# Calculating tf-idf 

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Calculating the tf-idf scores
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Training and predicting

In [217]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


# Initialize classifiers
NB = MultinomialNB()
KN = KNeighborsClassifier()
RF = RandomForestClassifier()

# Train classifiers
NB.fit(X_train_transformed, y_train)
KN.fit(X_train_transformed, y_train)
RF.fit(X_train_transformed, y_train)

# Predictions
NB_prediction = NB.predict(X_test_transformed)
KN_prediction = KN.predict(X_test_transformed)
RF_prediction = RF.predict(X_test_transformed)


             precision    recall  f1-score   support

   negative       0.68      1.00      0.81      1832
    neutral       0.83      0.11      0.19       586
   positive       0.93      0.15      0.26       429

avg / total       0.75      0.69      0.60      2847



# Evaluation


In [220]:
print classification_report(y_test, NB_prediction)

             precision    recall  f1-score   support

   negative       0.68      1.00      0.81      1832
    neutral       0.83      0.11      0.19       586
   positive       0.93      0.15      0.26       429

avg / total       0.75      0.69      0.60      2847



In [219]:
print classification_report(y_test, KN_prediction)

             precision    recall  f1-score   support

   negative       0.82      0.76      0.79      1832
    neutral       0.39      0.58      0.47       586
   positive       0.64      0.41      0.50       429

avg / total       0.70      0.67      0.68      2847



In [221]:
print classification_report(y_test, RF_prediction)

             precision    recall  f1-score   support

   negative       0.77      0.94      0.85      1832
    neutral       0.63      0.37      0.47       586
   positive       0.70      0.45      0.55       429

avg / total       0.73      0.75      0.72      2847

