In [20]:
import pandas as pd 
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

from scipy import stats


In [37]:
# Train set 3
df = pd.read_csv('../task2_data/task2_en_training.tsv', sep='\t')[['class', 'tweet']]
df_ex = pd.read_csv('../task2_data/data2_ex.csv')[['class', 'tweet']]
df_ex

Unnamed: 0,class,tweet
0,0,"depression hurts, cymbalta can help"
1,0,"@jessicama20045 right, but cipro can make thin..."
2,0,@fibby1123 are you on paxil .. i need help
3,0,@redicine the lamotrigine and sjs just made ch...
4,0,have decided to skip my #humira shot today. my...
...,...,...
21376,0,rt @15stephen15: #todolist 1-finish throat loz...
21377,0,me: how is your depression now that you�re on ...
21378,1,rt @ianibbo: finding out i'm allergic to fluox...
21379,0,@theantimyth @crossfitchemist unexpected and p...


In [36]:
def Model(df):
    Train = df.iloc[np.random.permutation(len(df))]
    df['class'] = df['class'].apply(lambda x: int(x))

    # Tokenization 
    X = Train['tweet']
    Y = Train['class']

    X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.2)
    tfidf = TfidfVectorizer()

    skf = StratifiedKFold(n_splits=5)
    skf.get_n_splits(X, Y)
    print(skf) 

    print('Stratified K-Fold')
    f1=[] # array of f1 scores 
    for train_index, test_index in skf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        tfidf.fit(X_train)
        X_train = tfidf.transform(X_train)
        X_test = tfidf.transform(X_test)

        SVM = SVC(kernel='linear')
        SVM.fit(X_train,y_train)

        predictions_SVM = SVM.predict(X_test)
        f1.append(round(f1_score(predictions_SVM, y_test, average='macro')*100,3))
        print("SVM F1 Score -> ", f1[-1])

    # avg f1
    print(f'F1 average:\t {round(np.array(f1).mean(), 3)} %')
    print(f'F1 std:\t {round(np.array(f1).std(), 3)}')
    print(f'F1 var:\t {round(np.array(f1).var(), 3)}')
    #print(stats.describe(f1))
    print()

print('working with dataset2...')
Model(df)
print('='*50)
print('working with dataset2_extended')
Model(df_ex)

working with dataset2...
StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
Stratified K-Fold
SVM F1 Score ->  62.211
SVM F1 Score ->  67.014
SVM F1 Score ->  68.334
SVM F1 Score ->  70.48
SVM F1 Score ->  53.6
F1 average:	 64.328 %
F1 std:	 6.011
F1 var:	 36.136

working with dataset2_extended
StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
Stratified K-Fold
SVM F1 Score ->  78.061
SVM F1 Score ->  77.27
SVM F1 Score ->  75.222
SVM F1 Score ->  74.173
SVM F1 Score ->  70.368
F1 average:	 75.019 %
F1 std:	 2.71
F1 var:	 7.342



# Dataset task_3

In [38]:
#Load the data into Pandas dataframe
df1 = pd.read_csv('../task3_data/task3_training.tsv', sep='\t')[['type', 'tweet']]
df1 = df1.fillna(0)
df1 = df1.replace('ADR', 1)
df1 = df1.iloc[np.random.permutation(len(df1))]
Train = df1

df2 = pd.read_csv('../task3_data/task3_validation.tsv', sep='\t')[['type', 'tweet']]
df2 = df2.fillna(0)
df2 = df2.replace('ADR', 1)
df2 = df2.iloc[np.random.permutation(len(df2))]
df2 = df2.dropna()
Test = df2

print(Train.shape, Train.shape)
print(Test.shape, Test.shape)

Train.head(10)

(2246, 2) (2246, 2)
(560, 2) (560, 2)


Unnamed: 0,type,tweet
1259,1,levaquin sucks. blinding headaches. vomiting. ...
664,0,@foodnetwork bravo for dumping dean. you won a...
2139,0,"headed to work, got a solid 3 hours of sleep l..."
343,1,@rachelhatesjazz watch your eyesight with lami...
1058,0,seriously want to hear of others experiences w...
495,1,my nigga dante addicted to that nicotine
1781,1,08.26 day 14 rivaroxaban diary. just been to c...
2072,1,i need to start taking my viibryd again but it...
1175,1,humira why you burn my veins
613,1,i stopped taking my seroquel because i hate it...


In [39]:
X_train = Train['tweet']
y_train = Train['type']

X_test = Test['tweet']
y_test = Test['type']

tfidf = TfidfVectorizer()
tfidf.fit(X_train)

X_train = tfidf.transform(X_train)
X_test = tfidf.transform(X_test)

f1=[]
SVM = SVC(kernel='linear')
SVM.fit(X_train,y_train)
predictions_SVM = SVM.predict(X_test)
f1.append(round(f1_score(predictions_SVM, y_test, average='macro')*100,3))
print("SVM F1 Score -> ", f1[-1])

SVM F1 Score ->  74.321


In [40]:
print(classification_report(y_test, predictions_SVM))

              precision    recall  f1-score   support

           0       0.74      0.57      0.65       195
           1       0.80      0.89      0.84       365

    accuracy                           0.78       560
   macro avg       0.77      0.73      0.74       560
weighted avg       0.78      0.78      0.77       560



In [98]:
X_train[1]

<1x5730 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

# FastText vactorized data

In [100]:
import fasttext


df_fasttext = pd.read_csv('../task2_data/data2_ex_fasttext.csv')[['class', 'tweet']]
df_fasttext['class'] = df_fasttext['class'].apply(lambda x: int(x))
df_fasttext['tweet'] = df_fasttext['tweet'].apply(lambda x: np.array(x))

Train = df_fasttext.iloc[np.random.permutation(len(df_fasttext))]
Train

Unnamed: 0,class,tweet
10803,0,[ 1.72466622e-03 1.04556698e-02 6.72125432e-...
12224,0,[-4.25047416e-04 9.64675378e-03 -2.68500275e-...
9983,0,[ 0.00193846 -0.0139621 0.00483886 0.005657...
9086,0,[ 0.00102627 0.00243027 -0.00176164 -0.002496...
5347,0,[ 1.5524051e-03 -9.0453317e-03 3.5807174e-03 ...
...,...,...
20673,0,[-6.0362910e-04 -2.8384433e-03 4.1719177e-03 ...
20181,1,[-0.00051116 -0.01081371 0.00475729 0.005804...
17338,0,[ 4.08772612e-04 1.10184364e-02 -3.80635750e-...
4897,0,[ 1.30958867e-03 1.38866045e-02 -5.62700583e-...


In [101]:
X = Train['tweet']
Y = Train['class']


# tfidf = TfidfVectorizer()
# tfidf.fit(X_train)

# X_train = tfidf.transform(X_train)
# X_test = tfidf.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16917,), (4230,), (16917,), (4230,))

In [102]:
#X_train = np.array(X_train)
X_train

5760     [ 8.4203895e-04 -8.1642478e-04  1.0820375e-03 ...
10058    [ 0.00149613  0.00555444 -0.00300463 -0.005840...
1740     [ 0.00240864  0.01360526 -0.0042684  -0.006971...
3386     [ 0.00082583  0.01619679 -0.00402853 -0.010228...
6010     [ 0.00370547  0.03346064 -0.01379369 -0.018577...
                               ...                        
20751    [ 1.4772103e-03  1.8447092e-04 -2.4529296e-04 ...
15516    [ 0.00297261  0.02018336 -0.004667   -0.011547...
18963    [ 0.00024215  0.02325715 -0.01034474 -0.013331...
16063    [-1.0613265e-03  3.4231944e-03 -2.4256518e-03 ...
1775     [ 0.00077729 -0.01174619  0.00479401  0.006003...
Name: tweet, Length: 16917, dtype: object

In [104]:
SVM = SVC(kernel='linear')
SVM.fit(float(X_train),y_train)

TypeError: cannot convert the series to <class 'float'>

In [69]:
predictions_SVM = SVM.predict(X_test)
f1.append(round(f1_score(predictions_SVM, y_test, average='macro')*100,3))
print("SVM F1 Score -> ", f1[-1])

SVM F1 Score ->  74.321
