### Import required modules

In [1]:
import nltk
import operator
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from textblob import TextBlob
from sklearn.model_selection import cross_val_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from senti_classifier import senti_classifier
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC

# Module-level NLP helpers shared by later cells.
# Lemmatizer used by `cleaning` to normalise the tokens it keeps.
wordnet_lemmatizer = WordNetLemmatizer()
# Tweet-aware tokenizer: output is lower-cased (preserve_case=False), elongated
# character runs are shortened (reduce_len=True) and @handles are kept as
# tokens (strip_handles=False).
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=False)
# English stop words from the NLTK corpus.
# NOTE(review): no visible cell reads `stopWords` — confirm it is still needed.
stopWords = set(stopwords.words('english'))

### Summary statistics of data set

In [1]:
# Load the training split. Every usable line is whitespace-separated as
# "<id> <label> <tweet words...>".
# NOTE: despite the names, `pos_tweets` collects the tweets labelled "N" and
# `neg_tweets` collects all others; the summary cell below relies on exactly
# this mapping, so the names are kept.
tweets = []
pos_tweets = []
neg_tweets = []
ID = []
Class = []

# `with` closes the handle even if a line fails to parse.
with open("train.txt", 'r', encoding='utf-8') as f:
    for line in f:
        word = line.split()
        if len(word) < 2:
            # Blank or truncated line: there is no label to read, so skip it
            # instead of raising IndexError half-way through the file.
            continue
        text = " ".join(word[2:])
        ID.append(word[0])
        Class.append(word[1])
        tweets.append(text)
        if word[1] == "N":
            pos_tweets.append((text, word[1]))
        else:
            neg_tweets.append((text, word[1]))

In [2]:
# Size of the training split and the share of each label.
# `pos_tweets` holds the "N" tweets and `neg_tweets` the "Y" tweets.
print("How many tweets in train set:  {}".format(len(tweets)))
for _label, _bucket in (("N", pos_tweets), ("Y", neg_tweets)):
    print("How many '{}':  {} instances and {:.3%} of the training data".format(
        _label, len(_bucket), len(_bucket) / len(tweets)))

How many tweets in train set:  3166
How many 'N':  2793 instances and 88.219% of the training data
How many 'Y':  373 instances and 11.781% of the training data


In [3]:
# Load the development split. Every usable line is whitespace-separated as
# "<id> <label> <tweet words...>".
# NOTE: as for the training split, `dev_pos_tweets` collects the tweets
# labelled "N" and `dev_neg_tweets` collects all others.
dev_tweets = []
dev_pos_tweets = []
dev_neg_tweets = []

# `with` closes the handle even if a line fails to parse.
with open("dev.txt", 'r', encoding='utf-8') as f:
    for line in f:
        word = line.split()
        if len(word) < 2:
            # Blank or truncated line: no label available, skip it rather
            # than raising IndexError.
            continue
        record = (" ".join(word[2:]), word[1])
        dev_tweets.append(record)
        if word[1] == "N":
            dev_pos_tweets.append(record)
        else:
            dev_neg_tweets.append(record)

In [4]:
# Size of the dev split and the share of each label.
# `dev_pos_tweets` holds the "N" tweets and `dev_neg_tweets` the "Y" tweets.
print("How many tweets in dev set:  {}".format(len(dev_tweets)))
for _label, _bucket in (("N", dev_pos_tweets), ("Y", dev_neg_tweets)):
    print("How many '{}':  {} instances and {:.3%} of the dev data".format(
        _label, len(_bucket), len(_bucket) / len(dev_tweets)))

How many tweets in dev set:  1076
How many 'N':  962 instances and 89.405% of the dev data
How many 'Y':  114 instances and 10.595% of the dev data


In [5]:
# Pool the train and dev counts to report the overall label balance.
total_pos = len(pos_tweets) + len(dev_pos_tweets)
total_neg = len(neg_tweets) + len(dev_neg_tweets)
total = len(tweets) + len(dev_tweets)
print("How many tweets in train + dev set:  {}".format(total))
for _label, _count in (("N", total_pos), ("Y", total_neg)):
    print("How many '{}':  {} instances and {:.3%} of the train + dev data".format(
        _label, _count, _count / total))

How many tweets in train + dev set:  4242
How many 'N':  3755 instances and 88.520% of the train + dev data
How many 'Y':  487 instances and 11.480% of the train + dev data


### Parse tweets

In [56]:
def parse_tweets(f):
    """Parse tab-separated tweet records into a DataFrame.

    Each non-blank line is expected to hold three tab-separated fields:
    id, class label and tweet text. Only the first two tabs are treated as
    separators, so a tweet that itself contains a tab is kept whole.

    Parameters
    ----------
    f : iterable of str
        An open text file, or any iterable yielding one record per item.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Class`` and ``Tweet`` (in that order), one row per
        parsed line, every value a whitespace-stripped string.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three tab-separated fields.
    """
    ids = []
    labels = []
    texts = []
    for line in f:
        if not line.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            continue
        fields = line.split('\t', 2)  # split once; maxsplit keeps tabs inside the tweet
        ids.append(fields[0].strip())
        labels.append(fields[1].strip())
        texts.append(fields[2].strip())

    # Passing `columns` pins the column order regardless of pandas version.
    return pd.DataFrame({'ID': ids, 'Class': labels, 'Tweet': texts},
                        columns=['ID', 'Class', 'Tweet'])

In [111]:
# Parse the three splits into DataFrames; `with` closes every handle even if
# parsing one of the files fails.
with open("train.txt", 'r', encoding='utf-8') as f1, \
        open("dev.txt", 'r', encoding='utf-8') as f2, \
        open("test.txt", 'r', encoding='utf-8') as f3:
    train = parse_tweets(f1)
    dev = parse_tweets(f2)
    test = parse_tweets(f3)

# Stack the splits in train -> dev -> test order with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on old and new pandas alike.
train_dev = pd.concat([train, dev], ignore_index=True)
train_dev_test = pd.concat([train_dev, test], ignore_index=True)

In [112]:
# Preview the first rows of the parsed training frame (columns ID, Class, Tweet).
train.head()

Unnamed: 0,ID,Class,Tweet
0,326376825590779905,N,Do U know what Medications are R for bipolar d...
1,326398829849092097,Y,I think my tablets have made me gain weight. A...
2,326406322323066883,Y,Thought of work is overwhelming me so much I f...
3,326407491460141056,N,@awakenings_ ziprasidone and olanzapine I.m. B...
4,326453069795688449,N,#كيف_تتخلص_من_الاكتئاب جرب Venlafaxine


### Will tweets' lengths give us some clues?

In [39]:
def tweets_length(df):
    """Return the mean tweet length, in characters, for each class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Class`` column (labels 'N' / 'Y') and a ``Tweet``
        column of strings.

    Returns
    -------
    tuple
        ``(mean length of 'N' tweets, mean length of 'Y' tweets)``.
    """
    mean_by_label = {
        label: df.loc[df['Class'] == label, 'Tweet'].str.len().mean()
        for label in ('N', 'Y')
    }
    return (mean_by_label['N'], mean_by_label['Y'])

In [44]:
# Mean tweet length per class as (N, Y): training split first, then dev.
for _split in (train, dev):
    print(tweets_length(_split))

(96.59362692445399, 107.50134048257372)
(97.10498960498961, 105.99122807017544)


### What about punctuation?

In [481]:
def count_punctuation(df):
    """Return the mean number of non-word characters per tweet for each class.

    The count uses the regex non-word class, so whitespace is counted along
    with punctuation and symbols.

    Returns
    -------
    tuple
        ``(mean for 'N' tweets, mean for 'Y' tweets)``.
    """
    non_word_counts = df['Tweet'].str.count(r'\W')
    is_n = df['Class'] == 'N'
    is_y = df['Class'] == 'Y'
    return (non_word_counts[is_n].mean(), non_word_counts[is_y].mean())

def count_digits(df):
    """Return the mean number of digit characters per tweet for each class.

    Returns
    -------
    tuple
        ``(mean for 'N' tweets, mean for 'Y' tweets)``.
    """
    def mean_digits(label):
        # Restrict to one class, then average the per-tweet digit counts.
        subset = df[df['Class'] == label]
        return subset['Tweet'].str.count(r'\d').mean()

    return (mean_digits('N'), mean_digits('Y'))

def count_Caps(df):
    """Return the mean number of ASCII capital letters (A-Z) per tweet for each class.

    Returns
    -------
    tuple
        ``(mean for 'N' tweets, mean for 'Y' tweets)``.
    """
    return tuple(
        df.loc[df['Class'] == label, 'Tweet'].str.count(r'[A-Z]').mean()
        for label in ('N', 'Y')
    )

def count_elongated_words(df):
    """Return the mean number of four-character repeats per tweet for each class.

    A match is any single character immediately repeated three more times,
    so runs of punctuation or spaces count as well as letters.
    NOTE(review): `add_tweet_level_features` restricts the repeated character
    to word characters; the two definitions are not identical.

    Returns
    -------
    tuple
        ``(mean for 'N' tweets, mean for 'Y' tweets)``.
    """
    repeat_pattern = r'(.)\1{3}'
    class_means = []
    for label in ('N', 'Y'):
        tweets_of_class = df[df['Class'] == label]['Tweet']
        class_means.append(tweets_of_class.str.count(repeat_pattern).mean())
    return (class_means[0], class_means[1])

# def count_quantity(df):
#     a = df[df['Class'] == 'N']
#     b = df[df['Class'] == 'Y']
#     return (a['Tweet'].str.count(r'\d[a-zA-Z]{1,3}\s').mean(), b['Tweet'].str.count(r'\d[a-zA-Z]{1,3}\s').mean())


In [482]:
# Per-class means as (N, Y) for every surface statistic: training split first,
# then dev, with a blank-line separator printed between statistics.
_stat_functions = (count_punctuation, count_digits, count_Caps, count_elongated_words)
for _position, _stat in enumerate(_stat_functions):
    if _position > 0:
        print('\n')
    print(_stat(train))
    print(_stat(dev))
# print('\n')
# print(count_quantity(train))
# print(count_quantity(dev))

(19.82205513784461, 22.171581769436997)
(19.805613305613306, 21.649122807017545)


(0.9477264590046545, 0.7479892761394102)
(1.0322245322245323, 0.8157894736842105)


(5.426781238811314, 4.458445040214477)
(5.618503118503119, 5.43859649122807)


(0.058360186179735055, 0.05093833780160858)
(0.05405405405405406, 0.03508771929824561)


In [None]:
def add_tweet_level_features(df):
    """Build a frame of surface-level counts for every tweet in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Tweet`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df['Tweet']`` with the columns ``tweet_length``,
        ``count_digits``, ``count_punctuations`` (non-word characters,
        whitespace included), ``count_CAPITALs`` (A-Z) and
        ``count_elongated_words`` (a word character repeated four times in a row).
    """
    tweet_text = df['Tweet']
    regex_features = (
        ('count_digits', r'\d'),
        ('count_punctuations', r'\W'),
        ('count_CAPITALs', r'[A-Z]'),
        ('count_elongated_words', r'(\w)\1{3}'),
    )
    columns = {'tweet_length': tweet_text.str.len()}
    for column_name, pattern in regex_features:
        columns[column_name] = tweet_text.str.count(pattern)
    # Explicit column list pins the order independent of dict ordering.
    ordered_names = ['tweet_length'] + [name for name, _ in regex_features]
    return pd.DataFrame(columns, columns=ordered_names)

# Compute the surface features on the RAW tweets (`train_dev_test`), not on the
# cleaned `train_dev_test_tweets`: cleaning lower-cases the text and drops every
# token that contains a digit or starts with a non-word character, so capital and
# digit counts would be (almost) all zero there. Row order is the same in both
# frames, so the positional train/dev/test slices taken later still line up.
a = add_tweet_level_features(train_dev_test)

In [488]:
# train_new = a.iloc[:3166].copy()
# dev_new = all_vectorized.iloc[3166:4242].copy()
# test_new = all_vectorized.iloc[4242:].copy()
# train94 = add_features(train94, a.iloc[:3166].copy())
# Attach the tweet-level features to the dev and test feature frames.
# Rows 3166-4241 of `a` are the dev tweets and rows 4242+ the test tweets
# (3166 training tweets, 4242 train + dev tweets).
# NOTE(review): `dev94` / `test94` are not created in any visible cell and
# `add_features` is defined in a later cell — this only runs with leftover
# kernel state; verify the intended execution order.
dev94 = add_features(dev94, a.iloc[3166:4242].copy())
test94 = add_features(test94, a.iloc[4242:].copy())

### Let's clean the tweets a little bit!

In [63]:
def tokenizer(df):
    """Tokenize and clean every tweet in ``df``.

    Each tweet is split with the module-level ``tknzr``, filtered and
    lemmatized by ``cleaning``, and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``Tweet`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One column, ``Tweet``, holding the cleaned text in the original row
        order with a fresh 0..n-1 index.
    """
    cleaned_texts = []
    for raw_tweet in df['Tweet']:
        kept_tokens = cleaning(tknzr.tokenize(raw_tweet))
        cleaned_texts.append(" ".join(kept_tokens))
    return pd.DataFrame(cleaned_texts, columns=["Tweet"])
    
def cleaning(tokens):
    """Filter and normalise the tokens of one tweet.

    Rules, applied in order to every token:
      1. tokens made only of non-word characters are dropped;
      2. @mentions are dropped;
      3. #hashtags are kept with every '#' removed (not lemmatized);
      4. tokens containing a digit are dropped;
      5. tokens that start with a non-word character are dropped;
      6. everything else is lemmatized with ``wordnet_lemmatizer`` and kept.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # All patterns are raw strings: the previous '@\w+' style literals relied on
    # invalid escape sequences, which newer Python versions warn about.
    kept = []
    for token in tokens:
        if re.match(r'^\W+$', token):
            continue
        if re.match(r'@\w+', token):
            continue
        if re.match(r'#\w+', token):
            kept.append(token.replace("#", ""))
            continue
        if re.match(r'.*\d.*', token):
            continue
        if re.match(r'\W+', token):
            continue
        kept.append(wordnet_lemmatizer.lemmatize(token))
    return kept

In [4]:
# f = open('adr_sample.txt', 'r', encoding = 'utf-8')
# adr_sample = []
# for line in f:
#     adr_sample.append(line.strip())
# f.close()
# adr_sample = pd.DataFrame(adr_sample, columns=['Tweet'])

In [114]:
# Cleaned-text frames (single `Tweet` column) for each split and for the
# stacked train + dev + test frame.
# NOTE(review): `dev_tweets` was a list of (text, label) tuples in the
# summary-statistics section; it is rebound to a DataFrame here.
train_tweets = tokenizer(train)
dev_tweets = tokenizer(dev)
test_tweets = tokenizer(test)
train_dev_test_tweets = tokenizer(train_dev_test)

In [86]:
# dic = nltk.FreqDist([item for sublist in [tknzr.tokenize(x) for x in list(adr_sample['Tweet'])] for item in sublist])
# sorted(dic.items(), key=operator.itemgetter(1), reverse=True)[:3]

### Add bigram and trigram

In [451]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Count word bigrams and trigrams; with an integer min_df, n-grams found in
# fewer than 10 tweets are left out of the vocabulary.
# NOTE(review): the vocabulary is fitted on train + dev + test together, so
# test-set text influences which features exist — confirm this is intended.
vect = CountVectorizer(min_df=10, ngram_range=(2,3), analyzer='word').fit(train_dev_test_tweets['Tweet'])
all_vectorized = vect.transform(train_dev_test_tweets['Tweet'])
# Show the shape and sparsity of the resulting count matrix.
all_vectorized

<5329x747 sparse matrix of type '<class 'numpy.int64'>'
	with 17059 stored elements in Compressed Sparse Row format>

In [452]:
# Wrap the sparse count matrix in a frame with one column per n-gram.
# NOTE(review): pd.SparseDataFrame was removed in pandas 1.0 and
# CountVectorizer.get_feature_names in scikit-learn 1.2, so this cell only
# runs on older releases; a port would use pd.DataFrame.sparse.from_spmatrix
# and get_feature_names_out, and must also adapt the densifying cell below.
all_vectorized = pd.SparseDataFrame(all_vectorized, columns=list(vect.get_feature_names()))

In [453]:
# Order the n-gram columns by name, replace missing cells with 0 (presumably
# the entries the sparse matrix did not store — verify), then convert to a
# regular dense frame.
# NOTE(review): the in-place calls make this cell non-idempotent, and
# `to_dense()` belongs to the legacy sparse API used in the previous cell.
all_vectorized.sort_index(axis=1, inplace=True)
all_vectorized.fillna(0, inplace=True)
all_vectorized = all_vectorized.to_dense()

In [457]:
def add_features(df, df2):
    """Insert the columns of ``df2`` before the last (label) column of ``df``.

    Neither input is modified. The previous version dropped the label column
    from the caller's ``df`` and reset the caller's ``df2`` index in place,
    which made re-running a cell silently change its inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing feature frame whose last column holds the labels.
    df2 : pandas.DataFrame
        New feature columns. Its index is discarded and rows are matched to
        ``df`` by position (``df`` is expected to carry a 0..n-1 index).

    Returns
    -------
    pandas.DataFrame
        ``df`` without its label column, followed by the columns of ``df2``,
        followed by the label column.
    """
    label_column = df.columns[-1]
    labels = df[label_column]
    features = df.drop(label_column, axis=1)   # returns a copy; caller's frame untouched
    extra = df2.reset_index(drop=True)         # positional alignment, without mutating df2
    return pd.concat([features, extra, labels], axis=1)

In [459]:
# Rows 0-3165 of the stacked n-gram matrix are the 3166 training tweets.
# NOTE(review): `train94` is not created in any visible cell — presumably a
# feature frame left in the kernel from earlier work; verify.
train_new = all_vectorized.iloc[:3166].copy()
train94 = add_features(train94, train_new)

In [460]:
# Rows 3166-4241 of the stacked n-gram matrix are the dev tweets.
# NOTE(review): `dev94` is not created in any visible cell — verify its origin.
dev_new = all_vectorized.iloc[3166:4242].copy()
dev94 = add_features(dev94, dev_new)

In [None]:
# Rows 4242 onwards of the stacked n-gram matrix are the test tweets.
# NOTE(review): `test94` is not created in any visible cell — verify its origin.
test_new = all_vectorized.iloc[4242:].copy()
test94 = add_features(test94, test_new)

In [524]:
# a = test_all[test_all.columns[list(range(0,2598))]].copy()
# a = pd.concat([a, test94], axis=1)
# a.drop(a.columns[-1], axis=1, inplace=True)
# a = pd.concat([a, test_all.iloc[:, 2598:]], axis=1)

### CSV I/O

In [2]:
# Load one table per split from CSV (presumably the assembled feature
# tables saved by an earlier run — confirm).
train_all = pd.read_csv('train_all.csv')
dev_all = pd.read_csv('dev_all.csv')
test_all = pd.read_csv('test_all.csv')

In [32]:
# Write `test_all` back to the same file it was read from (overwrites
# test_all.csv); the row index is not saved.
test_all.to_csv('test_all.csv', sep=',', encoding='utf-8', index=False)

In [None]:
# Remove the columns at positions 2598-2690 from `test_all`, in place.
# NOTE(review): the positions are hard-coded and the drop is not idempotent —
# running this cell twice removes a second, different set of columns.
test_all.drop(test_all.columns[2598:2691], axis=1, inplace=True)

### Machine Learning

In [30]:
# from sklearn.svm import LinearSVC
# from sklearn.feature_selection import SelectFromModel
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import roc_auc_score
# X_train = train_all.iloc[:,:-1].copy()
# y_train = train_all.iloc[:,-1].copy()
# X_test = dev_all.iloc[:,:-1].copy()
# y_test = dev_all.iloc[:,-1].copy()

In [29]:
# clf = Pipeline([
#   ('feature_selection', SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
#   ('classification', LinearSVC())
# ])
# clf.fit(X_train, y_train)

In [24]:
# predictions = clf.predict(X_test)

In [28]:
# roc_auc_score(y_test, predictions)

### Generate polarity score and subjectivity score for tweets

In [None]:
# Sentiment features per training tweet: TextBlob polarity and subjectivity on
# the cleaned text, VADER compound score on the text as given.
# NOTE(review): expects `train` to be an iterable of tweet strings and
# `Textcleaner` (defined in a later cell) to be available — verify run order.
analyzer = SentimentIntensityAnalyzer()

polarity = []
subjectivity = []
compound_score = []

for item in train:
    # Textcleaner maps a list of tweets to a list of cleaned strings, so wrap
    # the single tweet and unwrap the result; TextBlob only accepts a str.
    x = Textcleaner([item])[0]
    # Analyse once and reuse both components instead of building two blobs.
    sentiment = TextBlob(x).sentiment
    polarity.append(sentiment[0])
    subjectivity.append(sentiment[1])
    compound_score.append(analyzer.polarity_scores(item)['compound'])

In [None]:
# Load the training table that the sentiment columns are attached to below.
train_csv = pd.read_csv('train.csv')

In [67]:
# Attach the three sentiment scores as new columns.
# NOTE(review): assumes each list has exactly one entry per row of
# `train_csv`, in the same order — pandas raises on a length mismatch.
train_csv['polarity'] = polarity
train_csv['subjectivity'] = subjectivity
train_csv['compound_score'] = compound_score

### Side effect lexicon

In [59]:
# Open the side-effect lexicon; it is parsed and closed at the end of this cell.
# NOTE(review): no explicit encoding here, unlike the other open() calls in
# this notebook, so the platform default is used — confirm the file's encoding.
f = open('se.txt','r')
def parse_se(f):
    """Read side-effect terms, one per line, with surrounding whitespace removed.

    Blank lines are skipped: an empty term is a substring of every string, so
    keeping it would make ``side_effect`` flag every tweet.

    Parameters
    ----------
    f : iterable of str
        An open text file, or any iterable yielding one term per item.

    Returns
    -------
    list of str
        The non-empty terms in file order.
    """
    stripped_lines = (line.strip() for line in f)
    return [term for term in stripped_lines if term]
# `side` is the module-level lexicon that `side_effect` searches tweets for.
side = parse_se(f)
f.close()

### Tokenize tweets

In [137]:
# Split every tweet into tokens with the shared tweet tokenizer.
# NOTE(review): this expects `train`, `dev` and `test` to be iterables of
# tweet strings. In the "Parse tweets" section the same names are DataFrames,
# and iterating a DataFrame yields its column labels — verify which cell
# rebinds them before this one runs.
train = [tknzr.tokenize(x) for x in train]
dev = [tknzr.tokenize(x) for x in dev]
test = [tknzr.tokenize(x) for x in test]

In [139]:
# Re-join each token list into one space-separated string, giving normalised
# tweet text (tokens as produced by the tokenizer, single spaces between them).
train = [" ".join(x) for x in train]
dev = [" ".join(x) for x in dev]
test = [" ".join(x) for x in test]

In [60]:
def side_effect(tweets, lexicon=None):
    """Flag every tweet that mentions at least one lexicon term.

    Matching uses Python's ``in`` operator: for string tweets this is a plain
    substring test (so a term also matches inside a longer word); for
    tokenized tweets it is list membership.

    Parameters
    ----------
    tweets : iterable
        The tweets to check.
    lexicon : iterable of str, optional
        Terms to look for. Defaults to the module-level ``side`` list, which
        keeps existing one-argument calls working unchanged.

    Returns
    -------
    list of int
        ``1`` where some term occurs in the tweet, ``0`` otherwise, in input order.
    """
    if lexicon is None:
        lexicon = side
    # An empty term is "in" every string and would flag every tweet; ignore it.
    terms = [term for term in lexicon if term]
    return [int(any(term in tweet for term in terms)) for tweet in tweets]

### SentiWordNet Score

In [72]:
def Textcleaner(tweets):
    """Normalise a collection of tweets to lower-case words separated by single spaces.

    Every run of non-word characters is replaced by one space, then the
    result is stripped of leading/trailing whitespace and lower-cased.

    Parameters
    ----------
    tweets : iterable of str

    Returns
    -------
    list of str
        One cleaned string per input tweet, in input order.
    """
    cleaned = []
    for tweet in tweets:
        collapsed = re.sub(r'\W+', ' ', tweet)
        cleaned.append(collapsed.strip().lower())
    return cleaned

In [74]:
# Replace each split by its cleaned text (lower-case, non-word runs collapsed
# to single spaces). The uncleaned strings are no longer reachable under
# these names afterwards.
train = Textcleaner(train)
dev = Textcleaner(dev)
test = Textcleaner(test)

In [75]:
def score_sentiWordNet(doc):
    """Score every sentence of ``doc`` with SentiWordNet.

    The text is split into sentences, tokenized and POS-tagged with NLTK.
    Tokens tagged as noun, adjective, verb or adverb are lemmatized and looked
    up in SentiWordNet; a token's score is the average of
    (positive - negative) over all of its sense entries. A sentence's score
    is the mean of its token scores.

    Parameters
    ----------
    doc : str
        Text to score.

    Returns
    -------
    list
        One score per sentence, in order. A sentence with no scorable token
        contributes ``0``.
    """
    # Penn Treebank tag prefix -> WordNet part-of-speech letter.
    tag_prefix_to_pos = (('NN', 'n'), ('JJ', 'a'), ('V', 'v'), ('R', 'r'))

    tagged_sentences = [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(doc)
    ]
    lemmatizer = nltk.WordNetLemmatizer()

    sentence_sentiment = []
    for tagged_sentence in tagged_sentences:
        word_scores = []
        for token, tag in tagged_sentence:
            wn_pos = next(
                (pos for prefix, pos in tag_prefix_to_pos if tag.startswith(prefix)),
                '',
            )
            if wn_pos == '':
                # Not a noun/adjective/verb/adverb: SentiWordNet is not consulted.
                continue
            lemma = lemmatizer.lemmatize(token)
            synsets = list(swn.senti_synsets(lemma, wn_pos))
            if synsets:
                # Average the polarity over every sense of the word.
                polarity_total = 0
                for synset in synsets:
                    polarity_total += synset.pos_score() - synset.neg_score()
                word_scores.append(polarity_total / len(synsets))

        if word_scores:
            sentence_sentiment.append(sum(word_scores) / len(word_scores))
        else:
            sentence_sentiment.append(0)

    return sentence_sentiment

In [None]:
# SentiWordNet score of the first sentence of every training tweet.
# `b` and `c` are initialised here as well because the following cell fills
# them for the dev and test splits.
# NOTE(review): `a` is also the name of the tweet-level feature frame built
# earlier; this cell rebinds it to a plain list.
a = []
b = []
c = []
for x in train:
    scores = score_sentiWordNet(x)
    # A tweet that yields no sentences (e.g. empty after cleaning) gives an
    # empty score list; use a neutral 0 instead of raising IndexError.
    a.append(scores[0] if scores else 0)

In [None]:
# SentiWordNet score of the first sentence of every dev (`b`) and test (`c`)
# tweet. The lists are rebuilt from scratch so that re-running this cell does
# not append a second copy of the scores.
b = []
for x in dev:
    scores = score_sentiWordNet(x)
    # Neutral 0 when the tweet yields no sentences, instead of IndexError.
    b.append(scores[0] if scores else 0)

c = []
for x in test:
    scores = score_sentiWordNet(x)
    c.append(scores[0] if scores else 0)

### Feature engineering

In [423]:
# Gather the items of element [1] of every record across the three splits.
# NOTE(review): this assumes each record is a pair whose second item is a
# token list. In the cells above `train`, `dev` and `test` are lists of
# strings, for which x[1] is a single character — verify which version of
# these variables this cell was written for.
allwords=[]
for x in train+dev+test:
    allwords.extend(x[1])

In [466]:
# Frequency of every collected item, as (item, count) pairs sorted by
# descending count.
wordlist = nltk.FreqDist(allwords)
word_features = sorted(wordlist.items(), key=operator.itemgetter(1),reverse=True)
# NOTE(review): this displays the entire list; consider showing only a slice.
word_features

### Text classification

In [None]:
# def extract_features_bernoulli(document):
#     document_words = set(document)
#     features = {}
#     for word in word_features:
#         features['contains({})'.format(word[0])] = (word[0] in document_words)
#     return features

In [134]:
# training_set = nltk.classify.apply_features(extract_features, tweets_words)

In [20]:
# test_set = nltk.classify.apply_features(extract_features, test_words)

In [23]:
# classifierNB = nltk.NaiveBayesClassifier.train(training_set)

In [468]:
# classifierNB.show_most_informative_features(20)

In [26]:
# prediction = classifierNB.classify_many([x[0] for x in test_set])

In [467]:
# print("The accuracy score of NB Classifier is:", len([x for index, x in enumerate(prediction) if x == test_words[index][1]])/len(prediction))
# [test_tweets[index][0] for index, x in enumerate(prediction) if x == 'Y' and test_words[index][1] == 'N']
# [test_tweets[index][0] for index, x in enumerate(prediction) if x == 'N' and test_words[index][1] == 'Y']

In [446]:
# X_train = train.iloc[:,:-1]
# y_train = np.ravel(train.iloc[:,-1:])
# X_test = dev.iloc[:,:-1]
# y_test = np.ravel(dev.iloc[:,-1:])

In [None]:
# total.drop(['id', 'person', 'but', 'am', 'been', 'is', 'it', 'next', 
#             'bananas', 'can', 'this', 'pic', 'rt', 'same', 'though'],axis=1)

In [472]:
# X = total.iloc[:,:-1]
# y = np.ravel(total.iloc[:,-1:])
# total.columns

In [458]:
# clf = MultinomialNB()
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)

In [471]:
# clf.score(X_test, y_test)

In [470]:
# cv_scores = cross_val_score(clf, X, y, cv=10)
# cv_scores.mean()

In [469]:
# test = pd.read_csv('test.csv')
# test = test.iloc[:,:-1]

# test_prediction = knn.predict(test)
# test_prediction

In [363]:
# senti = []
# analyzer = SentimentIntensityAnalyzer()
# for sentence in new_sample:
#     vs = analyzer.polarity_scores(" ".join(sentence[1]))
#     senti.append((" ".join(sentence[1]), vs))