### Imports

In [3]:
import csv
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import string
import uritools
import urlextract
from langdetect import detect
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize

  from numpy.core.umath_tests import inner1d


### Reading Data

In [4]:
extractor = urlextract.URLExtract()
ps = PorterStemmer()

data = {'airline_sentiment':[],'text':[]}
airline_sentiment = []
corpus = []

with open('Tweets.csv', 'r', encoding='utf8') as f:
    tweets = csv.reader(f)
    for row in tweets:        
        data['airline_sentiment'].append(row[1])
        data['text'].append(row[10])

print("Data Read")

Data Read


### Normalizing, Stemming, CaseFolding and Removing Stop Words from Text

In [5]:
def clean(words):
    urls = extractor.find_urls(words+" ")
    for url in urls:
        words = words.replace(url,'')
    tknzr = TweetTokenizer()
    words = tknzr.tokenize(words)
    exclude = set(string.punctuation)
    words = [word.lower() for word in words if not word.lower() in exclude]
    words = [word.lower() for word in words 
            if not word in set(stopwords.words('english')) and not word.isdigit()]
    words = [ps.stem(word) for word in words]
    words = ' '.join(words)
    return words

### Similarity in Text

In [6]:
def similarity(docs):
    vectorizer = TfidfVectorizer()
    Docsdf = vectorizer.fit_transform(docs)
    Docsdf = (Docsdf * Docsdf.T).A
    a = 1
    b = 0
    for a in range(len(Docsdf)):
        for b in range(a):
            x = Docsdf[b][len(Docsdf)-a]
            if(x>0.9 and not (len(Docsdf)-a == b)):
                del docs[b]
                del data['airline_sentiment'][b]
                break
    return docs

### Filtering Text

In [7]:
def CleanWithoutFilter():
    corpus = []
    corpusText=''
    with open('Tweets.csv',  encoding='utf8') as File:
        spamreader = csv.reader(File)
        for row in spamreader:       
            corpusText =  clean(row[10])
            corpus.append(corpusText)
    return corpus

def CleanWithFilter():
    corpus = []
    corpusText=''
    counter = 0
    with open('Tweets.csv',  encoding='utf8') as File:
        spamreader = csv.reader(File)
        for row in spamreader:       
            corpusText =  clean(row[10])
            if(not(corpusText.__contains__("RT") or (len(corpusText )<20) or (detect(row[10])=="en"))):
                corpus.append(corpusText)
                counter+=1
            else:
                del data['airline_sentiment'][counter]
    
    corpusFinal = similarity(corpus)
    return corpusFinal

### Tf-idf Vectorizer

In [8]:
def vectorizerFunction(filterOrNoFilter):
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train, X_test, y_train, y_test = train_test_split(filterOrNoFilter,  data['airline_sentiment'], test_size = 0.2)
    vectorizer.fit(X_train)
    XTrain = vectorizer.transform(X_train)
    XTest = vectorizer.transform(X_test)
    return XTrain, XTest, y_train, y_test

### Multinomial Naive Bayes Classifier

In [9]:
def MNBClassifier(XTrain, XTest, y_train, y_test):
    clf = MultinomialNB(alpha = 1.0, class_prior = None, fit_prior = True)
    clf.fit(XTrain, y_train)
    predictions = clf.predict(XTest)
    score = f1_score(y_test, predictions, average = 'micro')  
    print(score)

### K Nearest Neighbour Classifier

In [10]:
def KNeighbourClassifiers(XTrain, XTest, y_train, y_test):
    neigh = KNeighborsClassifier(n_neighbors = 5)
    neigh.fit(XTrain, y_train) 
    predictions = neigh.predict(XTest)
    score = f1_score(y_test, predictions, average = 'micro')  
    print(score)

### Random Forest Classifier

In [11]:
def RForestClassifiers(XTrain, XTest, y_train, y_test):
    clf = RandomForestClassifier(random_state = 0)
    clf.fit(XTrain, y_train)
    predictions = clf.predict(XTest)
    score = f1_score(y_test, predictions, average = 'micro')  
    print(score)

### Calling Tfidf Vectorizer without Filtering

In [12]:
XTrain, XTest, y_train, y_test = vectorizerFunction(CleanWithoutFilter())

### Classifiers without Filter F1_Score

In [13]:
MNBClassifier(XTrain, XTest, y_train, y_test)

0.6712188460225332


In [14]:
KNeighbourClassifiers(XTrain, XTest, y_train, y_test)

0.692045066575623


In [15]:
RForestClassifiers(XTrain, XTest, y_train, y_test)

0.7275520655513827


### Calling Tfidf Vectorizer with Filtering

In [16]:
XTrain, XTest, y_train, y_test = vectorizerFunction(CleanWithFilter())

### Classifiers with Filter F1_Score

In [17]:
MNBClassifier(XTrain, XTest, y_train, y_test)

0.6153846153846154


In [18]:
KNeighbourClassifiers(XTrain, XTest, y_train, y_test)

0.5384615384615384


In [19]:
RForestClassifiers(XTrain, XTest, y_train, y_test)

0.5
