<h1 align=center>Text Classification</h1> 

This is our `hello world` example: a minimal sentiment classifier for Arabic tweets, inspected with eli5.

In [1]:
#!pip install eli5 --user

In [2]:
import eli5

In [3]:
import warnings
# Silence every warning issued through the `warnings` module for the rest of
# the session. This keeps cell output clean, but it is a blanket filter: any
# warning raised by later cells is hidden as well.
warnings.filterwarnings("ignore")

In [4]:
# helper function and imports
from IPython.display import display
def highlight_col(x, df=None):
    """Build a frame of CSS strings that shades each row by its sentiment label.

    Intended for ``Styler.apply(..., axis=None)``, which passes in the frame
    being styled and expects back a frame of CSS strings with the same index
    and columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame handed over by the Styler. It is used as the label source when
        ``df`` is omitted, so ``frame.style.apply(highlight_col, axis=None)``
        works without a wrapping lambda.
    df : pandas.DataFrame, optional
        Frame whose ``'label'`` column decides the colour of each row.
        Defaults to ``x``. When given, ``x`` is not consulted at all.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as the label source. Every cell in a ``'pos'``
        row holds ``'background-color: #e6ffe6'`` (light green), every cell in
        a ``'neg'`` row holds ``'background-color: #ffe6e6'`` (light red), and
        all remaining cells hold an empty string.
    """
    source = x if df is None else df
    # Start from "no styling" everywhere, then overwrite whole rows by label.
    styles = pd.DataFrame('', index=source.index, columns=source.columns)
    styles.loc[source['label'] == 'pos'] = 'background-color: #e6ffe6'
    styles.loc[source['label'] == 'neg'] = 'background-color: #ffe6e6'
    return styles

## Load the data

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Read the positive and negative training files (tab-separated, no header row)
# and stack them into a single frame with a fresh 0..n-1 index.
train_data = pd.concat(
    [
        pd.read_csv(tsv_path, sep='\t', header=None)
        for tsv_path in (
            "train_Arabic_tweets_positive_20190413.tsv",
            "train_Arabic_tweets_negative_20190413.tsv",
        )
    ],
    ignore_index=True,
)
train_data.columns = ['label', 'tweet']

# Raise the column-width limit so long tweets are not truncated when displayed.
pd.set_option('display.max_colwidth', 100000)

# Preview five random rows, shaded by label.
df_tmp = train_data.sample(5)
df_tmp.style.apply(highlight_col, df=df_tmp, axis=None)

Unnamed: 0,label,tweet
28671,neg,اللهم حبك وحب من يحبك وكل حب يقربنا إلى حبك ❤ يسعد صباحكوو ✋
36997,neg,ان شاء الله تحسون عاد 😂:) 💔
20451,pos,أوجاعك التي لم يلتفت لها أحد ، هي تحت عناية الله سبحانه ، ثق دائما بالله ولا تفقد الأمل انهو على كل شي قدير صبا…
37204,neg,هاذا الفساد بأم عينه 😭 الله يكفينا الشر
17042,pos,اصحى على الدنيا مطر 😍


In [6]:
# Read the positive and negative test files (tab-separated, no header row)
# and stack them into a single frame with a fresh 0..n-1 index.
test_data = pd.concat(
    [
        pd.read_csv(tsv_path, sep='\t', header=None)
        for tsv_path in (
            "test_Arabic_tweets_positive_20190413.tsv",
            "test_Arabic_tweets_negative_20190413.tsv",
        )
    ],
    ignore_index=True,
)
test_data.columns = ['label', 'tweet']

# Preview five random rows, shaded by label.
df_tmp = test_data.sample(5)
df_tmp.style.apply(highlight_col, df=df_tmp, axis=None)

Unnamed: 0,label,tweet
7096,neg,تلاتين سنة بترقص .. الليلة رقصتنا أنا ببكي 😭 دي حلاوة شديدة باخ
1213,pos,هذا كلام إعلامي يظهر في قناة (حكوميه) ! الدعوه سايبه يا حبيبي 🙂 و قبلها اساء لحمدالله لا إلتزام بالحياد و لا احترام…
11032,neg,هه 😂 💔 👊
914,pos,💎درر الجمعة💎 ۞ ﷽ ۞ . *(فارتدا علىٰ آثارهما قصصا)* 🌧.. تعلمنا سورة الكهف أن النجاح لا…
1750,pos,#ساعه_استجابه اللهم لا تحرمنا سعة رحمتك ، وسبوغ نعمتك ، وشمول عافيتك ، وجزيل عطائك ، أنت الحليم فلا تعجل ، وأنت ال…


## Baseline model (using pipeline)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Baseline: raw token counts fed into a logistic-regression classifier.
# `vec` and `clf` stay bound as globals because later cells pass them to eli5.
vec, clf = CountVectorizer(), LogisticRegression()
# Pipeline.fit returns the pipeline itself, so build and train in one statement.
pipe = make_pipeline(vec, clf).fit(train_data['tweet'], train_data['label'])

Let's test this basic model

In [8]:
from sklearn import metrics

def print_report(pipe, x_test, y_test):
    """Print a classification report and the accuracy of ``pipe`` on a test set.

    ``pipe`` must expose ``predict``; its predictions for ``x_test`` are
    compared against ``y_test``. Nothing is returned.
    """
    predictions = pipe.predict(x_test)
    print(metrics.classification_report(y_test, predictions))
    accuracy = metrics.accuracy_score(y_test, predictions)
    print("accuracy: {:0.3f}".format(accuracy))

# Score the baseline pipeline on the test tweets.
print_report(pipe, test_data['tweet'], test_data['label'])

              precision    recall  f1-score   support

         neg       0.77      0.82      0.80      5768
         pos       0.81      0.76      0.78      5752

    accuracy                           0.79     11520
   macro avg       0.79      0.79      0.79     11520
weighted avg       0.79      0.79      0.79     11520

accuracy: 0.792


## Let's take a look inside the model

In [9]:
# Render the classifier's learned weights with eli5 (listing limited via
# top=20); the vectorizer is passed so features can be shown by name.
eli5.show_weights(clf, vec=vec, top=20)

Weight?,Feature
+2.779,الإخونج
+2.414,وصباحك
+2.216,هالسنه
+2.093,ابريل
+2.092,السحب
+2.086,الزرقاء
+2.075,برونو
+2.029,اللوك
+1.887,الطيب
+1.880,حكمة


## Try our model on some tweets

In [10]:
# Explain the classifier's prediction for five randomly drawn test tweets,
# printing the true label first for comparison.
for label, tweet in test_data.sample(5)[['label', 'tweet']].itertuples(index=False, name=None):
    print(f"true label: {label}")
    display(eli5.show_prediction(clf, tweet, vec=vec))
    print("--" * 50)

true label: neg


Contribution?,Feature
1.144,Highlighted in text (sum)
0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
0.549,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
1.599,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
1.479,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
4.22,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------


## Try TF-IDF with character n-grams and document-frequency filtering

In [11]:
# TF-IDF over character n-grams of length 3-5, built inside word boundaries.
# min_df / max_df drop n-grams that occur in fewer than 1% or more than 30%
# of the training tweets.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=.01,
    max_df=.3,
)
clf = LinearSVC()

# NOTE: this rebinds the `vec` and `clf` globals used by the baseline cells.
pipe_tfidf = make_pipeline(vec, clf)
pipe_tfidf.fit(train_data['tweet'], train_data['label'])

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.3, min_df=0.01,
                                 ngram_range=(3, 5))),
                ('linearsvc', LinearSVC())])

In [12]:
# Score the TF-IDF + LinearSVC pipeline on the same test tweets.
print_report(pipe_tfidf, test_data['tweet'], test_data['label'])

              precision    recall  f1-score   support

         neg       0.85      0.82      0.84      5768
         pos       0.83      0.86      0.84      5752

    accuracy                           0.84     11520
   macro avg       0.84      0.84      0.84     11520
weighted avg       0.84      0.84      0.84     11520

accuracy: 0.838


In [13]:
# Render the learned weights of the current `clf` (rebound to LinearSVC in the
# TF-IDF cell) with eli5, limited via top=20; the vectorizer is passed so
# features can be shown by name.
eli5.show_weights(clf, vec=vec, top=20)

Weight?,Feature
+5.905,😂
+5.245,💙
+5.222,💛
+4.580,🌹
+3.866,🤣
+3.858,❤
+3.727,😍
+2.864,تويت
+2.555,💪
… 772 more positive …,… 772 more positive …


In [14]:
# Explain the current classifier's prediction for five randomly drawn test
# tweets, printing the true label first for comparison.
for label, tweet in test_data.sample(5)[['label', 'tweet']].itertuples(index=False, name=None):
    print(f"true label: {label}")
    display(eli5.show_prediction(clf, tweet, vec=vec))
    print("--" * 50)

true label: neg


Contribution?,Feature
0.656,Highlighted in text (sum)
-0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.477,Highlighted in text (sum)
0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
2.188,Highlighted in text (sum)
-0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
1.116,Highlighted in text (sum)
0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.42,Highlighted in text (sum)
0.084,<BIAS>


----------------------------------------------------------------------------------------------------


# Feel free to play with this notebook and explore different models with different datasets