# Text Classification

This is our `hello world` example 

Arabic Sentiment Twitter Corpus

We collected this dataset in April 2019. It contains 58K Arabic tweets (47K training, 11K test) annotated with positive and negative labels. The dataset is balanced and was collected using a lexicon of positive and negative emojis.

In [None]:
# Import the necessary library for displaying data frames with conditional formatting
from IPython.display import display

# Define a function for highlighting DataFrame cells based on conditions
def highlight_col(x, df=None):
    """Build a table of CSS strings that colours rows by their sentiment label.

    Meant for ``Styler.apply(..., axis=None)``, which expects a DataFrame of
    CSS strings with the same index and columns as the frame being styled.

    Parameters
    ----------
    x : pandas.DataFrame
        The frame handed over by ``Styler.apply``. Only used when ``df`` is
        omitted.
    df : pandas.DataFrame, optional
        Frame whose ``'label'`` column decides the colours. Defaults to ``x``,
        so the function can be passed to ``Styler.apply`` directly instead of
        being wrapped in a lambda that captures a global frame.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``. Rows labelled ``'pos'`` carry a
        green background, rows labelled ``'neg'`` a red one, and every other
        cell holds the empty string.
    """
    if df is None:
        df = x

    # Start from "no styling" everywhere; a separate local leaves the argument untouched
    styles = pd.DataFrame('', index=df.index, columns=df.columns)

    # Boolean row masks: rows with any other label keep the empty style
    styles.loc[df['label'] == 'pos', :] = 'background-color: #e6ffe6'   # green
    styles.loc[df['label'] == 'neg', :] = 'background-color: #ffe6e6'   # red

    return styles


## Load the data

In [14]:
# Import the necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the two class-specific training files (tab-separated, read without a header row)
train_pos, train_neg = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (
        "../data/arabic-sentiment-twitter-corpus/train_Arabic_tweets_positive_20190413.tsv",
        "../data/arabic-sentiment-twitter-corpus/train_Arabic_tweets_negative_20190413.tsv",
    )
)

# Stack the positive rows on top of the negative ones under a fresh index,
# then give the two columns readable names
train_data = pd.concat([train_pos, train_neg], ignore_index=True)
train_data.columns = ['label', 'tweet']

# The per-class frames are not needed once they are merged
del train_pos
del train_neg

# Raise the displayed column-width limit so long tweets are not cut off
pd.set_option('display.max_colwidth', 100000)

# Preview five randomly drawn rows, shading each row by its 'label' value
df_tmp = train_data.sample(n=5)
df_tmp.style.apply(lambda frame: highlight_col(frame, df_tmp), axis=None)


Unnamed: 0,label,tweet
13571,pos,كلابي وينكم مالكم حس 👣
11456,pos,ابتسامات الورد والفل لصباحك 🌷
33633,neg,انباء تؤكد غياب منصور الحربي عن لقاء النصر اليوم السبت بسبب الاصابة .. ☹ - البديل العمار الذي قدم مستوى اكثر من جي…
33296,neg,باقي انا اشد حيلي بالايديت ويطلع حلو 💔
18006,pos,بمناسبة فوز الهلال .. 💙 سحب على آيفون XR📱 رتويت وتابع - السحب بعد ساعة موثق بالفديو 💪


In [17]:
# Load the two class-specific test files (tab-separated, read without a header row)
test_pos, test_neg = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (
        "../data/arabic-sentiment-twitter-corpus/test_Arabic_tweets_positive_20190413.tsv",
        "../data/arabic-sentiment-twitter-corpus/test_Arabic_tweets_negative_20190413.tsv",
    )
)

# Stack the positive rows on top of the negative ones under a fresh index,
# then give the two columns readable names
test_data = pd.concat([test_pos, test_neg], ignore_index=True)
test_data.columns = ['label', 'tweet']

# The per-class frames are not needed once they are merged
del test_pos
del test_neg

# Preview five randomly drawn rows, shading each row by its 'label' value
df_tmp = test_data.sample(n=5)
df_tmp.style.apply(lambda frame: highlight_col(frame, df_tmp), axis=None)


Unnamed: 0,label,tweet
50,pos,هذا الفيديو فيه معنى المثل اللي يقول ( ضربني وبكى وسبقني واشتكى ) 😂 عموما ودي داكوستا يجلده صدق عشان يعرف يمثل مره…
10449,neg,اللهم املأ قبر ميتنا بالرضا والنور والفسحة والسرور ياعزيز ياغفور .. #اللهم_ارحم_الوالد_اخوي 💔
2576,pos,كل شيء احبه فيه ، متغير وكل شيء ماحبه على حاله :)
2632,pos,خارج النص | - فرحة مشجع عماني في نهائي كأس جلالة السلطان قابوس 😂
5820,neg,ღ♬ . . أسلى مع طيفك الين إنتعشتك عشان مااحس بغيابك على طول ياجعلها في ذمتك ماوحشتك 💔 #تصميمي…


## Baseline model (using pipeline)

In [None]:
# Import the necessary libraries for text vectorization and classification
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Token-count features feeding a logistic-regression classifier,
# both constructed with their default settings
vec = CountVectorizer()
clf = LogisticRegression()

# Chain the two steps so the pipeline can be fitted and queried on raw tweet text
pipe = make_pipeline(vec, clf)

# Train on the tweets and their labels; kept as the last expression of the cell
# so the fitted pipeline is displayed as the cell output
pipe.fit(train_data["tweet"], train_data["label"])


Let's test this basic model

In [50]:
# Import the necessary library for performance metrics
from sklearn import metrics

# Define a function 'print_report' for printing classification performance metrics
def print_report(pipe, x_test, y_test):
    """Print a classification report and the accuracy of ``pipe`` on a test set.

    Args:
        pipe: Fitted estimator exposing ``predict``.
        x_test: Inputs passed to ``pipe.predict``.
        y_test: Reference labels the predictions are scored against.

    Returns:
        None; both results are written to standard output.
    """
    predictions = pipe.predict(x_test)

    # Text table produced by sklearn's classification_report
    print(metrics.classification_report(y_test, predictions))

    # Overall accuracy, formatted with three decimals
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, predictions)))

# Score the baseline pipeline on the test tweets and their labels
print_report(pipe, test_data["tweet"], test_data["label"])


              precision    recall  f1-score   support

         neg       0.77      0.82      0.80      5768
         pos       0.81      0.76      0.78      5752

    accuracy                           0.79     11520
   macro avg       0.79      0.79      0.79     11520
weighted avg       0.79      0.79      0.79     11520

accuracy: 0.792


## Let's take a look inside the model

In [54]:
# Import the eli5 library, which is used for model interpretation and visualization
import eli5

# Show up to 20 of the most heavily weighted features of the baseline classifier.
# The estimators are read from the fitted `pipe` rather than from the global
# `clf` / `vec` names: those names are reassigned in the TF-IDF section further
# down, so using them here would explain the wrong model if this cell were
# re-run after that section. `make_pipeline` names each step after its
# lower-cased class name.
eli5.show_weights(
    pipe.named_steps['logisticregression'],
    vec=pipe.named_steps['countvectorizer'],
    top=20,
)


Weight?,Feature
+2.779,الإخونج
+2.414,وصباحك
+2.216,هالسنه
+2.093,ابريل
+2.092,السحب
+2.086,الزرقاء
+2.075,برونو
+2.029,اللوك
+1.887,الطيب
+1.880,حكمة


## Try our model on some tweets

In [58]:
# Take the estimators from the fitted baseline pipeline instead of the global
# `clf` / `vec` names, which the TF-IDF section further down reassigns; this
# keeps the explanations tied to the baseline model even when cells are re-run
# out of order.
baseline_clf = pipe.named_steps['logisticregression']
baseline_vec = pipe.named_steps['countvectorizer']

# Explain the baseline prediction for five randomly drawn test tweets
for _, row in test_data.sample(5).iterrows():
    # Reference label of the current tweet
    print(f"true label: {row['label']}")

    # Rich display of eli5's explanation for this tweet's prediction
    display(eli5.show_prediction(baseline_clf, row['tweet'], vec=baseline_vec))

    # Separator between tweets for readability
    print("--" * 50)


true label: pos


Contribution?,Feature
4.142,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
2.064,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
2.512,Highlighted in text (sum)
-0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
2.225,Highlighted in text (sum)
0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
0.969,Highlighted in text (sum)
0.334,<BIAS>


----------------------------------------------------------------------------------------------------


## Try Tfidf with some processing

In [61]:
# Character n-gram TF-IDF features:
#   analyzer='char_wb'  -> character n-grams built only from text inside word boundaries
#   ngram_range=(3, 5)  -> n-gram lengths 3, 4 and 5
#   min_df=.01          -> ignore n-grams present in fewer than 1% of the tweets
#   max_df=.3           -> ignore n-grams present in more than 30% of the tweets
# NOTE: this rebinds `vec` (and `clf` below), which the baseline section also uses.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=.01,
    max_df=.3,
)

# Linear support-vector classifier with default settings
clf = LinearSVC()

# Chain vectorizer and classifier, then train on the tweets and their labels;
# the fit call stays the last expression so the fitted pipeline is displayed
pipe_tfidf = make_pipeline(vec, clf)
pipe_tfidf.fit(train_data["tweet"], train_data["label"])


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.3, min_df=0.01,
                                 ngram_range=(3, 5))),
                ('linearsvc', LinearSVC())])

In [62]:
# Score the TF-IDF + LinearSVC pipeline on the test tweets and their labels
print_report(pipe_tfidf, test_data["tweet"], test_data["label"])

              precision    recall  f1-score   support

         neg       0.85      0.82      0.84      5768
         pos       0.83      0.86      0.84      5752

    accuracy                           0.84     11520
   macro avg       0.84      0.84      0.84     11520
weighted avg       0.84      0.84      0.84     11520

accuracy: 0.838


In [63]:
# Show up to 20 of the most heavily weighted features of the TF-IDF model.
# The estimators are read from the fitted `pipe_tfidf` rather than from the
# global `clf` / `vec` names, which the baseline section also assigns; this
# keeps the explanation tied to this pipeline whichever cell ran last.
eli5.show_weights(
    pipe_tfidf.named_steps['linearsvc'],
    vec=pipe_tfidf.named_steps['tfidfvectorizer'],
    top=20,
)

Weight?,Feature
+5.905,😂
+5.245,💙
+5.222,💛
+4.580,🌹
+3.866,🤣
+3.858,❤
+3.727,😍
+2.864,تويت
+2.555,💪
… 772 more positive …,… 772 more positive …


In [64]:
# Take the estimators from the fitted TF-IDF pipeline instead of the global
# `clf` / `vec` names, which the baseline section also assigns; this keeps the
# explanations tied to this pipeline whichever cell ran last.
tfidf_clf = pipe_tfidf.named_steps['linearsvc']
tfidf_vec = pipe_tfidf.named_steps['tfidfvectorizer']

# Explain the TF-IDF model's prediction for five randomly drawn test tweets
for _, row in test_data.sample(5).iterrows():
    print(f"true label: {row['label']}")
    display(eli5.show_prediction(tfidf_clf, row['tweet'], vec=tfidf_vec))
    print("--" * 50)

true label: pos


Contribution?,Feature
0.415,Highlighted in text (sum)
0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.14,Highlighted in text (sum)
0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
1.428,Highlighted in text (sum)
0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
2.35,Highlighted in text (sum)
-0.084,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
0.866,Highlighted in text (sum)
-0.084,<BIAS>


----------------------------------------------------------------------------------------------------
