In [1]:

# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import xgboost as xgb

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
import xgboost as xgb


from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

import time


# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline

# Style
sns.set(font_scale=1.5)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases removed the old spellings, so pick
# whichever name the installed Matplotlib actually ships. The order (pastel,
# then poster) is kept because later sheets override earlier ones.
for _sheet in ('pastel', 'poster'):
    _renamed = f'seaborn-v0_8-{_sheet}'
    style.use(_renamed if _renamed in style.available else f'seaborn-{_sheet}')

In [13]:
# Download NLTK's 'vader_lexicon' resource into the local NLTK data directory.
# NOTE(review): none of the cells visible in this notebook read this lexicon —
# confirm it is still needed before keeping the network call.
nltk.download('vader_lexicon')

In [2]:
# The competition data is mounted read-only under /kaggle/input.
# Walk that tree and echo the full path of every file it contains.

import os

for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [3]:
# Importing the dataset.
# The competition directory is named once so that pointing the notebook at a
# different location is a one-line change instead of three.
DATA_DIR = '../input/south-african-language-identification-2021'

train = pd.read_csv(f'{DATA_DIR}/train_set.csv')
test = pd.read_csv(f'{DATA_DIR}/test_set.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [4]:
# Show the first seven entries of the training 'text' column (plain-text repr).
print(train['text'].head(7))

In [5]:
# Display the first seven rows of the test frame.
test.head(7)

In [6]:
# Display the first rows of the sample submission to see the expected layout.
sample_submission.head()

In [7]:
# Number of training rows per language label.
train['lang_id'].value_counts()

In [8]:
# General overview of both datasets: shape, distinct texts, missing values.
def _print_overview(title, underline, frame, tail):
    """Print shape, distinct-text count and per-column null counts for one frame.

    `tail` is the trailing newline text appended after the null-count table.
    """
    print(title)
    print(underline + '\n')
    print(f'Shape of the dataset: {frame.shape}\n')
    print(f'Total Number of unique tweets: {len(set(frame["text"]))}\n')
    print(f'Total Number of missing values:\n{frame.isnull().sum()}{tail}')


_print_overview('TRAINING DATA', '=============', train, '\n\n')
_print_overview('TEST DATA', '=========', test, '\n')

In [9]:
def clean_text(text):
    """
    Normalise one text sample before it is vectorised.

    Steps, in order:
      1. every ``<...>`` span that sits on a single line (HTML-style tag)
         is replaced by a space;
      2. every run of digits is replaced by a space;
      3. the text is lowercased;
      4. all whitespace runs (spaces, tabs, newlines) collapse to one space
         and leading/trailing whitespace is dropped.

    Punctuation is NOT removed: characters such as '.', ',' and '!' pass
    through unchanged.

    Input:
    text: original text
          datatype: string

    Output:
    text: modified text
          datatype: string
    """
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    # split() with no argument splits on any whitespace, newlines included,
    # so no separate newline substitution is needed before re-joining.
    return ' '.join(text.lower().split())

In [10]:
# Run clean_text over the 'text' column of both frames, overwriting it in place.
for frame in (train, test):
    frame['text'] = frame['text'].map(clean_text)

In [11]:
# Replace the literal '.txt' with ' text file'.
# regex=False is spelled out on purpose: pandas releases before 2.0 treat the
# pattern as a regular expression by default, where '.' matches ANY character
# (so e.g. 'atxt' would also be rewritten). A literal match is what is wanted,
# and this makes the result independent of the installed pandas version.
train["text"] = train["text"].str.replace(".txt", " text file", regex=False)
test["text"] = test["text"].str.replace(".txt", " text file", regex=False)

In [12]:
# Features are the cleaned text; the target is the language label.
X, y = train['text'], train['lang_id']

In [13]:
# Hold out 10% of the training data for validation. The seed is fixed (42, as
# used for the models in this notebook) so the split — and therefore every
# score reported from it — is the same on each Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10,
                                                  random_state=42)

In [15]:
# Linear SVC on TF-IDF features built from unigrams and bigrams.
Lsvc = LinearSVC(random_state=42)
clf_text = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=1)),
    ('clf', Lsvc),
])

# Wall-clock time of the fit only; prediction is not part of the measurement.
start_time = time.time()
clf_text.fit(X_train, y_train)
run_time = time.time() - start_time

Lsvc_pred = clf_text.predict(X_val)

In [20]:
# Validation scores for the LinearSVC pipeline.
F1_Macro_Lsvc = f1_score(y_val, Lsvc_pred, average='macro')
# Micro-averaged F1 is what this notebook reports as "accuracy".
F1_Accuracy_Lsvc = f1_score(y_val, Lsvc_pred, average='micro')
F1_Weighted_Lsvc = f1_score(y_val, Lsvc_pred, average='weighted')
Execution_Time_Lsvc = run_time

# Each label goes on its own line, followed by the value on the next line.
print("F1 Macro:", F1_Macro_Lsvc, sep="\n")
print("F1 Accuracy:", F1_Accuracy_Lsvc, sep="\n")
print("F1 Weighted:", F1_Weighted_Lsvc, sep="\n")
print("Executuion Time:", Execution_Time_Lsvc, sep="\n")

In [21]:
# One-vs-rest logistic regression; C=1e5 is a very large inverse
# regularisation strength, i.e. almost no regularisation.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` — confirm
# against the installed version.
lr = LogisticRegression(
    C=1e5,
    max_iter=4000,
    multi_class='ovr',
    n_jobs=1,
    random_state=42,
)
clf_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=1)),
    ('clf', lr),
])

# Wall-clock time of the fit only; prediction is not part of the measurement.
start_time = time.time()
clf_lr.fit(X_train, y_train)
run_time = time.time() - start_time

lr_pred = clf_lr.predict(X_val)

In [22]:
# Validation scores for the logistic-regression pipeline.
F1_Macro_Lr = f1_score(y_val, lr_pred, average='macro')
# Micro-averaged F1 is what this notebook reports as "accuracy".
F1_Accuracy_Lr = f1_score(y_val, lr_pred, average='micro')
F1_Weighted_Lr = f1_score(y_val, lr_pred, average='weighted')
Execution_Time_Lr = run_time

# Each label goes on its own line, followed by the value on the next line.
print("F1 Macro:", F1_Macro_Lr, sep="\n")
print("F1 Accuracy:", F1_Accuracy_Lr, sep="\n")
print("F1 Weighted:", F1_Weighted_Lr, sep="\n")
print("Executuion Time:", Execution_Time_Lr, sep="\n")

In [23]:
# 5-nearest-neighbours classifier on the same unigram+bigram TF-IDF features.
kn = KNeighborsClassifier(n_neighbors=5)
clf_kn = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=1)),
    ('clf', kn),
])

# Wall-clock time of the fit only; prediction is not part of the measurement.
start_time = time.time()
clf_kn.fit(X_train, y_train)
run_time = time.time() - start_time

kn_pred = clf_kn.predict(X_val)

In [24]:
# Validation scores for the k-nearest-neighbours pipeline.
F1_Macro_kn = f1_score(y_val, kn_pred, average='macro')
# Micro-averaged F1 is what this notebook reports as "accuracy".
F1_Accuracy_kn = f1_score(y_val, kn_pred, average='micro')
F1_Weighted_kn = f1_score(y_val, kn_pred, average='weighted')
Execution_Time_kn = run_time

# Each label goes on its own line, followed by the value on the next line.
print("F1 Macro:", F1_Macro_kn, sep="\n")
print("F1 Accuracy:", F1_Accuracy_kn, sep="\n")
print("F1 Weighted:", F1_Weighted_kn, sep="\n")
print("Executuion Time:", Execution_Time_kn, sep="\n")

In [25]:
# Multinomial naive Bayes (default settings) on unigram+bigram TF-IDF features.
mnb = MultinomialNB()
clf_mnb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=1)),
    ('clf', mnb),
])

# Wall-clock time of the fit only; prediction is not part of the measurement.
start_time = time.time()
clf_mnb.fit(X_train, y_train)
run_time = time.time() - start_time

mnb_pred = clf_mnb.predict(X_val)

In [27]:
# Validation scores for the multinomial naive Bayes pipeline.
F1_Macro_mnb = f1_score(y_val, mnb_pred, average='macro')
# Micro-averaged F1 is what this notebook reports as "accuracy".
F1_Accuracy_mnb = f1_score(y_val, mnb_pred, average='micro')
F1_Weighted_mnb = f1_score(y_val, mnb_pred, average='weighted')
Execution_Time_mnb = run_time

# Each label goes on its own line, followed by the value on the next line.
print("F1 Macro:", F1_Macro_mnb, sep="\n")
print("F1 Accuracy:", F1_Accuracy_mnb, sep="\n")
print("F1 Weighted:", F1_Weighted_mnb, sep="\n")
print("Executuion Time:", Execution_Time_mnb, sep="\n")

In [29]:
# Linear model trained with SGD on a hinge loss with an L2 penalty.
# max_iter=5 together with tol=None means no tolerance-based stopping
# criterion is set and the epoch budget is five.
sdg = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    tol=None,
    random_state=42,
)
clf_sdg = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=1)),
    ('clf', sdg),
])

# Wall-clock time of the fit only; prediction is not part of the measurement.
start_time = time.time()
clf_sdg.fit(X_train, y_train)
run_time = time.time() - start_time

sdg_pred = clf_sdg.predict(X_val)

In [30]:
# Validation scores for the SGD pipeline.
F1_Macro_sdg = f1_score(y_val, sdg_pred, average='macro')
# Micro-averaged F1 is what this notebook reports as "accuracy".
F1_Accuracy_sdg = f1_score(y_val, sdg_pred, average='micro')
F1_Weighted_sdg = f1_score(y_val, sdg_pred, average='weighted')
Execution_Time_sdg = run_time

# Each label goes on its own line, followed by the value on the next line.
print("F1 Macro:", F1_Macro_sdg, sep="\n")
print("F1 Accuracy:", F1_Accuracy_sdg, sep="\n")
print("F1 Weighted:", F1_Weighted_sdg, sep="\n")
print("Executuion Time:", Execution_Time_sdg, sep="\n")

In [31]:
# Refining the train-test split for validation: keep 99% of the labelled data
# for training the tuned model and hold out only 1% as a sanity check.
# The seed is fixed so the hold-out rows, and the report computed on them,
# are identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01,
                                                  random_state=42)

In [32]:
# Grid search over MultinomialNB's smoothing parameter `alpha`.
#
# The whole TF-IDF + classifier pipeline is what gets cross-validated, so the
# vectoriser is re-fitted on the training part of every fold. Nesting the
# search inside the pipeline instead would fit TF-IDF once on all of X_train,
# letting each validation fold contribute to the vocabulary and IDF weights
# used to score it.
param_grid = {'mnb__alpha': [0.1, 1, 5, 10]}  # '<step name>__<parameter>'

mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                   max_df=0.9,
                                                   ngram_range=(1, 2))),
                         ('mnb', MultinomialNB())])

# With the default refit=True the best pipeline is re-trained on all of
# X_train, so `tuned_mnb` can be used directly for fit/predict as before.
tuned_mnb = GridSearchCV(mnb_pipeline,
                         param_grid=param_grid,
                         cv=5,
                         n_jobs=-1,
                         scoring='f1_weighted')

tuned_mnb.fit(X_train, y_train)  # Fitting the model

y_pred_mnb = tuned_mnb.predict(X_val)  # predicting the fit on validation set

print(classification_report(y_val, y_pred_mnb))

In [24]:
# Build the submission: the test rows' 'index' column plus the predicted
# language for each row, written without the DataFrame's own index.
submission_df = test[['index']].assign(lang_id=tuned_mnb.predict(test['text']))
submission_df.to_csv('submission2.csv', index=False)