## Import Required Libraries

In [None]:
# General libraries:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 2700)
# scikit-learn libraries:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
# NLP libraries:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.porter import *

## Constants

In [None]:
# General:
# Handed to pd.read_csv in the "Read Data" cell. '/' is only a placeholder:
# replace it with the path of the dataset CSV before running the notebook.
PATH = '/'
# NLTK language names. MODEL_LANG_EN is the one passed to clean_sentence;
# MODEL_LANG_AR is not referenced in the visible cells.
MODEL_LANG_AR = "arabic"
MODEL_LANG_EN = "english"

# Model related:
SPLIT_PERC = 0.25          # fraction of rows held out as the test split
SPLIT_RANDOM_STATE = 7     # seed for train_test_split
LOSS = 'hinge'
# Must be a single string. The previous `'l2' | 'l1' | 'elasticnet'` applied
# the `|` operator to str operands, which raises TypeError when the cell runs.
PENALTY = 'l2'             # alternatives accepted by SGDClassifier: 'l1', 'elasticnet'
ALPHA = 1e-3
MODEL_RANDOM_STATE = 7     # seed for SGDClassifier
MAX_ITER = 20

## Read Data

In [None]:
# Load the raw dataset from the CSV located at PATH into a DataFrame.
data = pd.read_csv(filepath_or_buffer=PATH)

## Preprocess Data

In [None]:
"""
Label languages:
"""
def isEnglish(s):
    """Return a language tag for ``s`` based on whether it is pure ASCII.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted. Returns ``"en"`` when every character is ASCII and ``"ar"``
    as soon as any non-ASCII character is present.
    """
    utf8_bytes = str(s).encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        # At least one byte is outside the ASCII range.
        return "ar"
    return "en"
"""
Remove punctuation:
"""
# Characters that clean_text deletes from its input. Besides punctuation and
# symbols, the list also contains a few accented letters (e.g. 'Â', 'à', 'é'),
# so those are stripped as well.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    """Return ``str(x)`` with every entry of ``puncts`` removed.

    Entries are removed one after another in list order; an entry that does
    not occur in the text leaves it unchanged.
    """
    text = str(x)
    for mark in puncts:
        text = text.replace(mark, '')
    return text
    
"""
Cleanse text (NLP): 
Note: pass the language of your corpus (e.g. MODEL_LANG_EN or MODEL_LANG_AR) as the second argument.
"""
WNL = WordNetLemmatizer()
# language name -> frozenset of stop words; filled lazily so the corpus is
# downloaded and read once per language instead of once per token.
_STOPWORDS_BY_LANG = {}


def _stopwords_for(language):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOPWORDS_BY_LANG:
        nltk.download("stopwords", quiet=True)
        _STOPWORDS_BY_LANG[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANG[language]


def clean_sentence(sentence, MODEL_LANG_EN):
    """Normalise one piece of text for the bag-of-words model.

    Steps: tokenise, lower-case, lemmatise with WordNet, drop stop words of
    the requested language, and drop any token containing ``"@"``.

    Parameters
    ----------
    sentence : any
        Text to clean; converted with ``str()`` first.
    MODEL_LANG_EN : str
        NLTK stop-word language name (e.g. ``"english"``). The parameter name
        is kept for backward compatibility even though it shadows the
        module-level constant of the same name.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.

    Notes
    -----
    Only the ``stopwords`` corpus is downloaded here.
    NOTE(review): ``word_tokenize`` and ``WordNetLemmatizer`` rely on their
    own NLTK resources, which presumably must already be installed - confirm
    for fresh environments.
    """
    stop_words = _stopwords_for(MODEL_LANG_EN)
    tokens = (token.lower() for token in word_tokenize(str(sentence)))
    lemmas = (WNL.lemmatize(token) for token in tokens)
    kept = [word for word in lemmas if word not in stop_words and "@" not in word]
    return ' '.join(kept)


# Tag the language from the raw text first, before the text column is overwritten.
data['lang'] = data['column_name'].apply(isEnglish)
# Strip the characters listed in `puncts` from the Name column.
data['Name'] = data['Name'].apply(clean_text)
# Replace the text column with its tokenised / lemmatised / stop-word-free form.
data['column_name'] = data['column_name'].apply(clean_sentence, args=(MODEL_LANG_EN,))

## Build Models

In [None]:
"""
Encode data labels:
"""
# Map the label column to integer class ids; `le` is kept for decoding them later.
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

# Features are the cleaned text, targets the encoded labels.
X, y = data['column_name'], data['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=SPLIT_PERC,
    random_state=SPLIT_RANDOM_STATE,
)

"""
Build & train model
Note: you can use any model other than SGDClassifier
"""
# Token counts -> TF-IDF weighting -> linear classifier trained with SGD.
sgd = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss=LOSS,
                penalty=PENALTY,
                alpha=ALPHA,
                random_state=MODEL_RANDOM_STATE,
                max_iter=MAX_ITER,
                tol=None,
            ),
        ),
    ]
)
sgd.fit(X_train, y_train)


# Predictions on the held-out split, kept available for later inspection.
y_pred = sgd.predict(X_test)
# score() on the fitted pipeline returns a fraction; scale it to a percentage.
score = sgd.score(X_test, y_test) * 100
# The previous call passed `score` as a second print argument, so the '%s'
# placeholder was printed literally instead of being filled in.
print(f'accuracy {score:.2f}%')
