In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight

import daar.nlp_helpers as nh

In [2]:
# Path to the pickled, preprocessed tweets (the file name suggests no stemming was applied).
# The path is relative to the notebook's working directory.
# The separate vocab / labels_to_int pickles are not loaded in this notebook; the label
# mapping is rebuilt from the data further down.
data_path = '../input/dialect-processed-data/preprocessed_data_no_stem.obj'

In [3]:
# Read the preprocessed dataset from data_path through the project helper.
# The result is used below like a pandas DataFrame with `text` and `dialect` columns.
# NOTE(review): load_pickle_file presumably unpickles the file — unpickling can execute
# arbitrary code, so only point data_path at trusted files.
data = nh.load_pickle_file(data_path)

In [4]:
# Peek at the first five rows to check the id / text / dialect columns.
data.head(n=5)

Unnamed: 0,id,text,dialect
0,1175358310087892992,"[بالنهايه, ينتفض, يغير]",IQ
1,1175416117793349632,"[يعني, محسوب, البشر, حيونه, ووحشيه, وتطلبون, ا...",IQ
2,1175450108898565888,"[مبين, كلامه, خليجي]",IQ
3,1175471073770573824,"[يسلملي, مرورك, وروحك, الحلوه]",IQ
4,1175496913145217024,"[وين, الغيبه, محمد]",IQ


In [5]:
# Number of tweets before short tweets are filtered out.
data.shape[0]

458197

In [6]:
# Each `text` entry is a list of tokens, so its length is the tweet's word count.
data['tweet_length'] = data['text'].map(len)

In [7]:
# Keep only tweets with strictly more than `threshold` tokens.
threshold = 2
is_long_enough = data['tweet_length'].gt(threshold)
data = data.loc[is_long_enough]

In [8]:
# Renumber rows 0..n-1 after filtering; the old index is discarded, not kept as a column.
data = data.reset_index(drop=True)

In [9]:
# Number of tweets that remain after dropping the short ones.
data.shape[0]

449784

### Convert each tweet to one string
Each tweet is stored as a list of words; we join the words back into a single string separated by spaces.

In [10]:
# Turn every token list into one string using the project helper, element by element.
data_string = data['text'].map(nh.get_one_string)

In [11]:
# Check the first five joined tweets.
data_string.head(n=5)

0                                  بالنهايه ينتفض يغير
1    يعني محسوب البشر حيونه ووحشيه وتطلبون الغرب يح...
2                                     مبين كلامه خليجي
3                            يسلملي مرورك وروحك الحلوه
4                                      وين الغيبه محمد
Name: text, dtype: object

In [12]:
# Distinct dialect codes present in the filtered data.
labels = pd.unique(data['dialect'])

# labels_to_int: dict keyed by label, with integer values starting from 0.
# int_to_labels: presumably the inverse lookup — confirm against nh.get_mappings.
mappings = nh.get_mappings(labels, i=0)
labels_to_int, int_to_labels = mappings

In [13]:
# Persist both label mappings; they will be needed again at inference time.
for mapping, filename in ((labels_to_int, 'labels_to_int.obj'),
                          (int_to_labels, 'int_to_labels.obj')):
    nh.save_pickle_file(mapping, filename)

In [14]:
split_frac = 0.2

## Features are the space-joined tweets; targets are the dialect codes.
X = data_string.to_numpy()
y = data['dialect'].to_numpy()

## Both splits shuffle with the same fixed seed so the partition is reproducible.
split_options = dict(shuffle=True, random_state=42)

## Step 1: hold out `split_frac` of the samples, stratified by dialect.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=split_frac, stratify=y, **split_options)

## Step 2: halve the hold-out set into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, **split_options)

## Report the shape of each resulting feature array.
print('\t\t\tFeatures Shapes:')
print(f'Train set: \t\t{X_train.shape}',
      f'\nValidation set: \t{X_val.shape}',
      f'\nTest set: \t\t{X_test.shape}')

			Features Shapes:
Train set: 		(359827,) 
Validation set: 	(44978,) 
Test set: 		(44979,)


In [15]:
# ngram_range=(1, 2) makes the vectorizer use both single words and bigrams
# (sequences of two consecutive words) as features.
tfidf = TfidfVectorizer(dtype=np.float64, ngram_range=(1, 2))
classifier = MultinomialNB()

# Baseline model: TF-IDF features fed into multinomial Naive Bayes, fit on the training split.
pipe = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', classifier),
])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
                ('classifier', MultinomialNB())])

In [16]:
# Predict dialect labels for the test split with the Naive Bayes pipeline.
y_hat = pipe.predict(X=X_test)

In [17]:
# Show predictions next to the ground truth (numpy abbreviates the long arrays).
for heading, values in (('Predicted:\n', y_hat), ('Actual:\n', y_test)):
    print(heading, values)

Predicted:
 ['PL' 'PL' 'EG' ... 'PL' 'EG' 'KW']
Actual:
 ['JO' 'PL' 'EG' ... 'PL' 'AE' 'BH']


In [18]:
# Macro-averaged F1 of the Naive Bayes pipeline on the test split
# (the unweighted mean of per-class F1, so every dialect counts equally).
# f1_score is imported with the other dependencies in the first cell.
f1_score(y_test, y_hat, average='macro')

0.24427983971932546

In [19]:

# pipe2 = Pipeline([('vectorizer', tfidf), ('classifier', SVC())])

# pipe2.fit(X_train, y_train)

In [20]:
# Recompute the test-split predictions and print them above the true labels.
y_hat = pipe.predict(X_test)
print(y_hat, y_test, sep='\n')

['PL' 'PL' 'EG' ... 'PL' 'EG' 'KW']
['JO' 'PL' 'EG' ... 'PL' 'AE' 'BH']


In [21]:
# Score the Naive Bayes pipeline on the validation split (macro-averaged F1).
y_hat = pipe.predict(X_val)
f1_score(y_true=y_val, y_pred=y_hat, average='macro')

0.24172671478331006

In [22]:
def test(classifier, save_path='pipe_rf_20.obj'):
    """Fit a TF-IDF + ``classifier`` pipeline on the training split and score it on validation.

    Uses the notebook-level ``tfidf``, ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Parameters
    ----------
    classifier : estimator
        Classifier used as the final pipeline step. It is passed to the pipeline as-is
        (not copied), so the caller's object is the one that gets fitted.
    save_path : str, optional
        File the fitted pipeline is written to with ``nh.save_pickle_file``. Defaults to
        'pipe_rf_20.obj', the name that used to be hard-coded; pass a distinct name per
        classifier so one run does not overwrite another's model.

    Returns
    -------
    float
        Macro-averaged F1 score on the validation split.
    """
    # Use a fresh copy of the vectorizer so fitting here does not refit the shared
    # `tfidf` instance that the earlier `pipe` pipeline also holds.
    pipe_line = Pipeline([('vectorizer', clone(tfidf)), ('classifier', classifier)])
    pipe_line.fit(X_train, y_train)

    y_hat = pipe_line.predict(X_val)
    # Compute the score once and reuse it for both the printout and the return value.
    macro_f1 = f1_score(y_val, y_hat, average='macro')

    print('Predicted:\n', y_hat)
    print('Actual:\n', y_val)
    print(macro_f1)
    nh.save_pickle_file(pipe_line, save_path)
    return macro_f1

In [23]:
# Random forest with 20 trees and a fixed seed; class_weight='balanced' asks scikit-learn
# to weight classes inversely to their frequency in the training labels.
rf = RandomForestClassifier(
    n_estimators=20,
    class_weight='balanced',
    random_state=42,
)

# Fit, evaluate on the validation split, and display the returned macro F1.
test(rf)

Predicted:
 ['LY' 'EG' 'EG' ... 'SA' 'KW' 'PL']
Actual:
 ['BH' 'PL' 'DZ' ... 'SA' 'OM' 'PL']
0.3585734302190894


0.3585734302190894

In [24]:
# Persist the random-forest classifier object on its own.
# NOTE(review): this stores only the classifier, not the TF-IDF vectorizer; the complete
# pipeline was already written to 'pipe_rf_20.obj' by test() — confirm which artifact
# inference is meant to load.
nh.save_pickle_file(rf, 'rf_20.obj')