# importing packages 

In [4]:

# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
#from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
#import xgboost as xgb
from sklearn.pipeline import Pipeline

#from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

import time


# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline

# Style
sns.set(font_scale=1.5)
style.use('seaborn-pastel')
style.use('seaborn-poster')

In [5]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/MamaAmakah/nltk_data...


True

In [6]:
# importing the dataset
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
sample_submission = pd.read_csv('sample_submission.csv')

Some EDA to determine how to treat data

In [7]:
print(train['text'].head(7)) #looking thetext column

0    umgaqo-siseko wenza amalungiselelo kumaziko ax...
1    i-dha iya kuba nobulumko bokubeka umsebenzi na...
2    the province of kwazulu-natal department of tr...
3    o netefatša gore o ba file dilo ka moka tše le...
4    khomishini ya ndinganyiso ya mbeu yo ewa maana...
5    dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6    kgetse nngwe le nngwe e e sa faposiwang mo tsh...
Name: text, dtype: object


In [8]:
train.lang_id.value_counts() #checking for value counts

eng    3000
tso    3000
tsn    3000
afr    3000
nbl    3000
ssw    3000
zul    3000
xho    3000
nso    3000
ven    3000
sot    3000
Name: lang_id, dtype: int64

In [9]:
# Taking general overview at both datasets
print('TRAINING DATA')
print('============='+('\n'))
print('Shape of the dataset: {}\n'.format(train.shape))
print('Total Number of unique tweets: {}\n'.format(len(set(train['text']))))
print('Total Number of missing values:\n{}\n\n'.format(train.isnull().sum()))
print('TEST DATA')
print('========='+('\n'))
print('Shape of the dataset: {}\n'.format(test.shape))
print('Total Number of unique tweets: {}\n'.format(len(set(test['text']))))
print('Total Number of missing values:\n{}\n' .format(test.isnull().sum()))

TRAINING DATA

Shape of the dataset: (33000, 2)

Total Number of unique tweets: 29948

Total Number of missing values:
lang_id    0
text       0
dtype: int64


TEST DATA

Shape of the dataset: (5682, 2)

Total Number of unique tweets: 5459

Total Number of missing values:
index    0
text     0
dtype: int64



Some data engineering before modelling data

In [10]:
#function to clean our data, removing punctuation,extra spaces etc
def clean_text(text):
    text=re.sub('<.*?>', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub("\n"," ",text)
    text = text.lower()
    text=' '.join(text.split())
    return text

In [11]:
# Application of the function to clean the tweets
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [12]:
# Replace '.txt' with 'text file'
train["text"] = train["text"].str.replace(".txt", " text file")
test["text"] = test["text"].str.replace(".txt", " text file")

  train["text"] = train["text"].str.replace(".txt", " text file")
  test["text"] = test["text"].str.replace(".txt", " text file")


# Modelling

#divide training data into features and target variable

In [15]:
X = train['text']
y = train['lang_id']

In [16]:
#splitdatainto training and validation data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10)

LINEAR SVC REGRESSION

In [17]:
#linear SVC model
Lsvc=LinearSVC(random_state=42)
#fit the model
clf_text = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                               max_df=0.9,
                                               ngram_range=(1, 2))),
                     ('clf', Lsvc)])

# Logging the Execution Time for each model
start_time = time.time()
clf_text.fit(X_train, y_train)
run_time = time.time()-start_time

Lsvc_pred = clf_text.predict(X_val)

In [18]:
#Lsvc metrics
F1_Macro_Lsvc=metrics.f1_score(y_val,Lsvc_pred,average='macro')
print("F1 Macro:")
print(F1_Macro_Lsvc)
F1_Accuracy_Lsvc=metrics.f1_score(y_val, Lsvc_pred,average='micro')
print("F1 Accuracy:")
print(F1_Accuracy_Lsvc)
F1_Weighted_Lsvc=metrics.f1_score(y_val,Lsvc_pred,average='weighted')
print("F1 Weighted:")
print(F1_Weighted_Lsvc)
Execution_Time_Lsvc=run_time
print("Executuion Time:")
print(Execution_Time_Lsvc)

F1 Macro:
0.997303941557524
F1 Accuracy:
0.9972727272727273
F1 Weighted:
0.9972735388876004
Executuion Time:
11.32358694076538


LOGISTIC REGRESSION

In [20]:
lr=LogisticRegression(random_state=42,
                                  multi_class='ovr',
                                  n_jobs=1,
                                  C=1e5,
                                  max_iter=4000)
#fit the model
clf_lr = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                               max_df=0.9,
                                               ngram_range=(1, 2))),
                     ('clf', lr)])

        # Logging the Execution Time for each model
start_time = time.time()
clf_lr.fit(X_train, y_train)
run_time = time.time()-start_time

lr_pred = clf_lr.predict(X_val)

In [21]:
#Lr metrics
F1_Macro_Lr=metrics.f1_score(y_val,lr_pred,average='macro')
print("F1 Macro:")
print(F1_Macro_Lr)
F1_Accuracy_Lr=metrics.f1_score(y_val, lr_pred,average='micro')
print("F1 Accuracy:")
print(F1_Accuracy_Lr)
F1_Weighted_Lr=metrics.f1_score(y_val,lr_pred,average='weighted')
print("F1 Weighted:")
print(F1_Weighted_Lr)
Execution_Time_Lr=run_time
print("Executuion Time:")
print(Execution_Time_Lr)

F1 Macro:
0.9976154555126836
F1 Accuracy:
0.9975757575757576
F1 Weighted:
0.9975765940407467
Executuion Time:
153.14912295341492


KN NEIGHBOURS CLASSIFIER

In [22]:
kn=KNeighborsClassifier(n_neighbors=5)
#fit the model
clf_kn = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                               max_df=0.9,
                                               ngram_range=(1, 2))),
                     ('clf', kn)])

# Logging the Execution Time for each model
start_time = time.time()
clf_kn.fit(X_train, y_train)
run_time = time.time()-start_time

kn_pred = clf_kn.predict(X_val)

In [23]:
#kn metrics
F1_Macro_kn=metrics.f1_score(y_val,kn_pred,average='macro')
print("F1 Macro:")
print(F1_Macro_kn)
F1_Accuracy_kn=metrics.f1_score(y_val, kn_pred,average='micro')
print("F1 Accuracy:")
print(F1_Accuracy_kn)
F1_Weighted_kn=metrics.f1_score(y_val,kn_pred,average='weighted')
print("F1 Weighted:")
print(F1_Weighted_kn)
Execution_Time_kn=run_time
print("Executuion Time:")
print(Execution_Time_kn)

F1 Macro:
0.9669844746624089
F1 Accuracy:
0.9672727272727273
F1 Weighted:
0.9669583027167831
Executuion Time:
6.7123730182647705


MULTINOMIAL NB

In [24]:
mnb=MultinomialNB()
#fit the model
clf_mnb = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                               max_df=0.9,
                                               ngram_range=(1, 2))),
                     ('clf', mnb)])

# Logging the Execution Time for each model
start_time = time.time()
clf_mnb.fit(X_train, y_train)
run_time = time.time()-start_time

mnb_pred = clf_mnb.predict(X_val)

In [25]:
#mnb metrics
F1_Macro_mnb=metrics.f1_score(y_val,mnb_pred,average='macro')
print("F1 Macro:")
print(F1_Macro_mnb)
F1_Accuracy_mnb=metrics.f1_score(y_val, mnb_pred,average='micro')
print("F1 Accuracy:")
print(F1_Accuracy_mnb)
F1_Weighted_mnb=metrics.f1_score(y_val,mnb_pred,average='weighted')
print("F1 Weighted:")
print(F1_Weighted_mnb)
Execution_Time_mnb=run_time
print("Executuion Time:")
print(Execution_Time_mnb)

F1 Macro:
0.9984939191192189
F1 Accuracy:
0.9984848484848485
F1 Weighted:
0.9984848256743745
Executuion Time:
7.656442165374756


SGD CLASSIFIER

In [26]:
sdg=SGDClassifier(loss='hinge',penalty='l2',
                             alpha=1e-3,
                             random_state=42,
                             max_iter=5,
                             tol=None)
#fit the model
clf_sdg = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                               max_df=0.9,
                                               ngram_range=(1, 2))),
                     ('clf', sdg)])

# Logging the Execution Time for each model
start_time = time.time()
clf_sdg.fit(X_train, y_train)
run_time = time.time()-start_time

sdg_pred = clf_sdg.predict(X_val)

In [27]:
#sdg metrics
F1_Macro_sdg=metrics.f1_score(y_val,sdg_pred,average='macro')
print("F1 Macro:")
print(F1_Macro_sdg)
F1_Accuracy_sdg=metrics.f1_score(y_val, sdg_pred,average='micro')
print("F1 Accuracy:")
print(F1_Accuracy_sdg)
F1_Weighted_sdg=metrics.f1_score(y_val,sdg_pred,average='weighted')
print("F1 Weighted:")
print(F1_Weighted_sdg)
Execution_Time_sdg=run_time
print("Executuion Time:")
print(Execution_Time_sdg)

F1 Macro:
0.9867638870091128
F1 Accuracy:
0.9866666666666668
F1 Weighted:
0.986629587697399
Executuion Time:
9.397150039672852


In [28]:
# Refining the train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01)

In [29]:
# Creating a pipeline for the gridsearch
param_grid = {'alpha': [0.1, 1, 5, 10]}  # setting parameter grid

tuned_mnb = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                max_df=0.9,
                                                ngram_range=(1, 2))),
                      ('mnb', GridSearchCV(MultinomialNB(),
                                           param_grid=param_grid,
                                           cv=5,
                                           n_jobs=-1,
                                           scoring='f1_weighted'))
                      ])

tuned_mnb.fit(X_train, y_train)  # Fitting the model

y_pred_mnb = tuned_mnb.predict(X_val)  # predicting the fit on validation set

print(classification_report(y_val, y_pred_mnb))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00        29
         eng       1.00      1.00      1.00        32
         nbl       1.00      1.00      1.00        29
         nso       1.00      1.00      1.00        24
         sot       1.00      1.00      1.00        36
         ssw       1.00      1.00      1.00        35
         tsn       1.00      1.00      1.00        32
         tso       1.00      1.00      1.00        24
         ven       1.00      1.00      1.00        36
         xho       1.00      1.00      1.00        30
         zul       1.00      1.00      1.00        23

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       330



In [24]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = tuned_mnb.predict(test['text'])
submission_df.to_csv('submission2.csv', index=False)