<h1 align=center>Text Classification</h1> 

This is our `hello world` example 

In [None]:
# helper functions and imports
from IPython.display import display
def highlight_col(x, df):
    """Build a per-cell CSS frame that shades rows by their sentiment label.

    Intended as the callback for ``Styler.apply(..., axis=None)``.

    Parameters
    ----------
    x : ignored; the styling is derived entirely from ``df``.
    df : DataFrame with a ``'label'`` column holding ``'pos'`` / ``'neg'``.

    Returns
    -------
    DataFrame of CSS strings with the same index and columns as ``df``:
    light green for ``'pos'`` rows, light red for ``'neg'`` rows, and an
    empty string (no styling) for every other row.
    """
    # `pd` is looked up when the function is called, so pandas only has to be
    # imported before the first call, not before this definition.
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    row_colours = (
        ('pos', 'background-color: #e6ffe6'),
        ('neg', 'background-color: #ffe6e6'),
    )
    for label, css in row_colours:
        styles.loc[df['label'] == label] = css
    return styles

## Load the data

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Read the positive and negative training tweets (tab-separated, no header
# row) and stack them into one frame with a fresh 0..n-1 index.
train_data = pd.concat(
    [
        pd.read_csv(path, sep='\t', header=None)
        for path in (
            "../data/arabic-sentiment-twitter-corpus/train_Arabic_tweets_positive_20190413.tsv",
            "../data/arabic-sentiment-twitter-corpus/train_Arabic_tweets_negative_20190413.tsv",
        )
    ],
    ignore_index=True,
)
train_data.columns = ['label', 'tweet']

# Raise pandas' column-width display limit so long tweets are not cut off.
pd.set_option('display.max_colwidth', 100000)

# Preview five rows. The fixed seed keeps the preview identical across
# re-runs (Restart & Run All); change it to look at different rows.
df_tmp = train_data.sample(5, random_state=42)
# Styler.apply forwards extra keyword arguments to the callback, so the
# frame is bound here instead of being captured late by a lambda.
df_tmp.style.apply(highlight_col, df=df_tmp, axis=None)

In [None]:
# Read the positive and negative test tweets (tab-separated, no header row)
# and stack them into one frame with a fresh 0..n-1 index.
test_data = pd.concat(
    [
        pd.read_csv(path, sep='\t', header=None)
        for path in (
            "../data/arabic-sentiment-twitter-corpus/test_Arabic_tweets_positive_20190413.tsv",
            "../data/arabic-sentiment-twitter-corpus/test_Arabic_tweets_negative_20190413.tsv",
        )
    ],
    ignore_index=True,
)
test_data.columns = ['label', 'tweet']

# Preview five rows. The fixed seed keeps the preview identical across
# re-runs (Restart & Run All); change it to look at different rows.
df_tmp = test_data.sample(5, random_state=42)
# Styler.apply forwards extra keyword arguments to the callback, so the
# frame is bound here instead of being captured late by a lambda.
df_tmp.style.apply(highlight_col, df=df_tmp, axis=None)

## Baseline model (using pipeline)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Baseline: raw token counts fed into a logistic-regression classifier.
# `vec` and `clf` stay bound at module level because the pipeline fits these
# same objects in place, and later cells inspect them directly.
vec, clf = CountVectorizer(), LogisticRegression()
# Pipeline.fit returns the pipeline itself, so `pipe` is the fitted pipeline
# and the assignment produces no cell output.
pipe = make_pipeline(vec, clf).fit(train_data['tweet'], train_data['label'])

Let's test this basic model

In [None]:
from sklearn import metrics

def print_report(pipe, x_test, y_test):
    """Print a classification report and the accuracy of ``pipe`` on a test set.

    Parameters
    ----------
    pipe : fitted estimator or pipeline exposing ``predict``.
    x_test : inputs handed to ``pipe.predict``.
    y_test : true labels the predictions are scored against.

    Returns
    -------
    None. The report and a three-decimal accuracy line are printed.
    """
    # Predict once and reuse the result for both metrics.
    predictions = pipe.predict(x_test)
    print(metrics.classification_report(y_test, predictions))
    accuracy = metrics.accuracy_score(y_test, predictions)
    print("accuracy: {:0.3f}".format(accuracy))

# Score the baseline pipeline on the tweets loaded from the test files.
print_report(pipe, test_data.tweet, test_data.label)

## Let's take a look inside the model

In [None]:
import eli5
# Render the classifier's feature weights (limited by top=20), using `vec` to
# map feature indices back to readable tokens.
eli5.show_weights(clf, vec=vec, top=20)

## Try our model on some tweets

In [None]:
# Explain the classifier's prediction on five test tweets, printing the true
# label above each explanation. The fixed seed makes the same tweets come up
# on every run, so the output is reproducible; change it to see other tweets.
for _, row in test_data.sample(5, random_state=42).iterrows():
    print(f"true label: {row['label']}")
    display(eli5.show_prediction(clf, row['tweet'], vec=vec))
    print("--"*50)

## Try Tfidf with some processing

In [None]:
# TF-IDF over character n-grams (3 to 5 characters, 'char_wb' analyzer),
# with document-frequency cut-offs given as the floats min_df / max_df.
# NOTE: this rebinds `vec` and `clf`, replacing the baseline objects.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=.01,
    max_df=.3,
)
clf = LinearSVC()
pipe_tfidf = make_pipeline(vec, clf)
# Last expression of the cell: fits the pipeline and displays its repr.
pipe_tfidf.fit(train_data['tweet'], train_data['label'])

In [None]:
# Score the TF-IDF + LinearSVC pipeline on the same test tweets.
print_report(pipe_tfidf, test_data.tweet, test_data.label)

In [None]:
# Render feature weights (limited by top=20). In a top-to-bottom run, `clf`
# and `vec` are now the LinearSVC and TfidfVectorizer, not the baseline objects.
eli5.show_weights(clf, vec=vec, top=20)

In [None]:
# Explain the classifier's prediction on five test tweets, printing the true
# label above each explanation. The fixed seed makes the same tweets come up
# on every run, so the output is reproducible; change it to see other tweets.
for _, row in test_data.sample(5, random_state=42).iterrows():
    print(f"true label: {row['label']}")
    display(eli5.show_prediction(clf, row['tweet'], vec=vec))
    print("--"*50)

# Feel free to play with this notebook: explore different models with different datasets