In [None]:
# Mount Google Drive at /content/drive/ so the CSV files read later from
# '/content/drive/My Drive/...' are reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

### Import necessary packages

In [None]:
import csv
import json
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report, roc_auc_score

from matplotlib import pyplot as plt

import seaborn as sns

from __future__ import unicode_literals

import warnings
warnings.filterwarnings('ignore')

### Create a set of characters to keep

In [None]:
# Characters that the cleaning step keeps verbatim: the letters listed below,
# the digits 0-9, the full stop and the Arabic question mark.
# This is a set: membership tests are cheap, but its iteration order should not
# be relied upon wherever it is enumerated.
spec_chars = {'ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ',
              'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ه', 'ی', '0', '1', '2', '3', '4', '5', '6', '7', '8',
              '9', '.', '؟'}

In [None]:
# Removing special characters
def rm_spec_ch(data):
    """Return ``data`` as one string containing only characters from ``spec_chars``.

    Every character is handled independently:

    * a character that is in ``spec_chars`` is kept unchanged;
    * a few Arabic letter variants are replaced by the corresponding letter
      that is in ``spec_chars`` (see ``replacements`` below);
    * anything else becomes a single space.

    Parameters
    ----------
    data : str or iterable of str
        The text to clean. For an iterable of strings the cleaned pieces are
        concatenated without a separator.

    Returns
    -------
    str
        The cleaned text; it has exactly one output character per input character.
    """
    # Letter variants that are not in spec_chars, mapped to the letter that is.
    replacements = {
        'ي': 'ی',
        'آ': 'ا',
        'أ': 'ا',
        'إ': 'ا',
        'ك': 'ک',
    }
    cleaned = []
    for element in data:
        for ch in element:
            if ch in spec_chars:
                # Append the character itself, not the whole element: appending the
                # element duplicated it once per kept character for list input.
                cleaned.append(ch)
            else:
                cleaned.append(replacements.get(ch, ' '))
    # Joining once avoids repeated string concatenation inside the loop.
    return ''.join(cleaned)


### Read the files

Reading training data and cleaning

In [None]:
# Parallel lists: position k in each list belongs to the k-th stored row of train.csv.
title = []
category = []
text = []

# Set the maximum field size accepted by the csv parser to 300000.
csv.field_size_limit(300000)

# newline='' hands line-ending handling to the csv module, as its documentation
# asks for; without it, newlines embedded in quoted fields can be mis-parsed.
with open('/content/drive/My Drive/Colab Notebooks/train.csv', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    # The first row is treated as the header: it is counted but not stored.
    line_count = 0 if next(csv_reader, None) is None else 1
    for row in csv_reader:
        if not row:
            continue  # skip blank lines
        # Columns 1, 3 and 5 are stored as title, category and text respectively.
        title.append(rm_spec_ch(row[1]))
        category.append(row[3])
        text.append(rm_spec_ch(row[5]))
        line_count += 1
    print(f'Processed {line_count} lines.')

# Running totals for the corpus statistics printed further down.
total_char_count = 0
total_token_count = 0
total_frequent_token_count = 0

# Decomposing into tokens (training data)
for i in range(len(text)):
    # Surround the two sentence-ending marks with spaces so each becomes its own token.
    spaced = text[i].replace('.', ' . ').replace('؟', ' ؟ ')
    # Split on single spaces, drop the empty strings produced by runs of spaces,
    # and replace every all-numeric token with the placeholder 'N'.
    # One filtering pass replaces the repeated list.remove('') scans.
    text[i] = ['N' if tok.isnumeric() else tok for tok in spaced.split(' ') if tok]

    total_token_count += len(text[i])

# Populate 1-gram dictionary: token -> number of occurrences in the training articles.
unigram = dict()
for tokens in text:
    for token in tokens:
        # A token seen for the first time starts from an implicit count of zero.
        unigram[token] = unigram.get(token, 0) + 1
        # Characters are counted per token, so separators are not included.
        total_char_count += len(token)

# Mean article length in tokens, rounded down.
average_news_length = total_token_count // len(text)

# Turn into a list of (word, count) sorted by count from most to least.
unigram = sorted(unigram.items(), key=lambda pair: pair[1], reverse=True)

# Frequent words are discovered only through the training data
# Only this many of the most frequent tokens are kept as the known vocabulary.
FREQUENT_WORD_LIMIT = 10000
# Slicing (unlike indexing positions 0..9999) also works when the corpus has
# fewer distinct tokens than the limit.
frequent_entries = unigram[:FREQUENT_WORD_LIMIT]

top_thousands = [entry[0] for entry in frequent_entries]
freq_word_dict = dict(frequent_entries)  # word -> count, for fast membership tests
total_frequent_token_count += sum(entry[1] for entry in frequent_entries)

# One frequent word per line, most frequent first.
with open('most_frequent.txt', 'w', encoding='utf-8') as file:
    for word in top_thousands:
        file.write(word + '\n')

# The full vocabulary, one word per line, in the same order as unigram.
with open('words.txt', 'w', encoding='utf-8') as file:
    for entry in unigram:
        file.write(entry[0] + '\n')

# Corpus statistics for the training set. The character total is the summed length
# of all tokens as they stand after tokenisation.
print('Total Number of Characters:\t', total_char_count)
print('Total Number of Words:\t\t', total_token_count)
print('Number of Unique Words:\t\t', len(unigram))
print('Proportion of Frequent Words:\t %', (total_frequent_token_count / total_token_count) * 100)
print('Average Length of News:\t\t', average_news_length, 'Words')

# Replacing least frequent words with "UNK"
# freq_word_dict is a dict, so each membership test is a hash lookup.
for tokens in text:
    for position, token in enumerate(tokens):
        if token not in freq_word_dict:
            # Overwrite in place; the token list keeps its length.
            tokens[position] = 'UNK'

word2index = dict()
index2word = dict()
# Enumerate the characters in sorted order: iterating the set directly gives an
# order that can differ between Python processes, which would make the saved
# character indices irreproducible.
ordered_chars = sorted(spec_chars)
char2index = {c: i for i, c in enumerate(ordered_chars)}
index2char = {i: c for i, c in enumerate(ordered_chars)}

# Index 0 is reserved for the out-of-vocabulary marker.
word2index['UNK'] = 0
index2word[0] = 'UNK'

# The remaining words are numbered from 1 in the order of unigram
# (descending frequency).
for index in range(len(unigram)):
    word = unigram[index][0]
    word2index[word] = index + 1
    index2word[index + 1] = word

# NOTE: despite the '.pickle' extension these files contain JSON text. JSON object
# keys are strings, so the integer keys of index2word and index2char are written
# as strings.
for path, mapping in (('word2index.pickle', word2index),
                      ('index2word.pickle', index2word),
                      ('char2index.pickle', char2index),
                      ('index2char.pickle', index2char)):
    # The context manager closes the file even if serialisation fails.
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(mapping))

In [None]:
# Give every distinct category label an integer code, numbered from 0 in order
# of first appearance, and keep the reverse mapping as well.
category2code = dict()
code2category = dict()

index = 0
for label in category:
    # Skip empty labels, the literal header value 'category', and labels already coded.
    if label == '' or label == 'category' or label in category2code:
        continue
    category2code[label] = index
    code2category[index] = label
    index += 1

In [None]:
# Keep only the articles that are no longer than average_news_length tokens and
# right-pad each of them with 'PAD' so that all kept articles have that length.
filtered_title = []
filtered_category = []
filtered_text = []
for idx, tokens in enumerate(text):
    if len(tokens) > average_news_length:
        continue  # articles longer than the average are dropped
    tokens.extend(['PAD'] * (average_news_length - len(tokens)))
    filtered_title.append(title[idx])
    # .get() yields None for a label that has no entry in category2code.
    filtered_category.append(category2code.get(category[idx]))
    filtered_text.append(tokens)

title, category, text = filtered_title, filtered_category, filtered_text

## Create corresponding *DataFrame*

In [None]:
df = pd.DataFrame(list(zip(title, category, text)), columns=['Title', 'Category', 'Text'])
# Each article is a list of tokens; join it back into one space-separated string.
df['Text'] = [" ".join(news) for news in df['Text'].values]
# Articles whose label had no category code are missing here; give them the code 10.
# fillna is used instead of chained indexing (df['Category'][mask] = ...), which
# pandas does not guarantee to write back into df.
# NOTE(review): 10 would collide with a real code if there are more than 10 categories — confirm.
df['Category'] = df['Category'].fillna(10)

## Split data into sets of train and validation
70% --> train

30% --> validation

In [None]:
# Hold out 30% of the articles for validation. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
X_train, X_validation, y_train, y_validation = train_test_split(
    df['Text'], df['Category'], test_size=0.3, random_state=42)

## Vectorizing the tokens

In [None]:
# Learn the vocabulary from the training texts only and build their token-count matrix.
# NOTE(review): with CountVectorizer's default settings the text is lower-cased and only
# tokens of two or more word characters are kept, so 'N', '.', '؟' and one-letter
# words would be dropped — confirm this is intended.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

## Create *TF-IDF* matrix

In [None]:
# Fit the TF-IDF weighting on the training count matrix and apply it to that matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## Multinomial Naïve Bayes Classifier

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of the training set.
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
# clf was fitted on TF-IDF features, so the validation texts must pass through the same
# count -> TF-IDF transformation (using the already-fitted objects) before prediction.
X_validation_tfidf = tfidf_transformer.transform(count_vect.transform(X_validation))
y_predicted = clf.predict(X_validation_tfidf)

#### Summarize into a pipeline

In [None]:
# Token counts -> TF-IDF -> multinomial Naive Bayes, bundled into a single estimator so
# that fitting and prediction always apply the same preprocessing.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

In [None]:
# Predict validation labels with the fitted pipeline; this rebinds y_predicted.
y_predicted = text_clf.predict(X_validation)

In [None]:
# Compare the current y_predicted with the held-out validation labels.
print('Confusion Matrix:\n', confusion_matrix(y_validation, y_predicted))
print('Classification Report:\n', classification_report(y_validation, y_predicted))
print('Accuracy Score:', accuracy_score(y_validation, y_predicted))

## Support Vector Machine Classifier

In [None]:
# Token counts -> TF-IDF -> linear classifier trained with SGD (hinge loss, L2 penalty).
# random_state pins the shuffling done by the SGD solver.
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              random_state=42)),
])
# Bind the return value so the cell does not echo the fitted estimator.
_ = text_clf_svm.fit(X_train, y_train)

In [None]:
# Class predictions and raw decision-function scores for the validation set.
# NOTE(review): y_score is not read by any cell visible here.
predicted_svm = text_clf_svm.predict(X_validation)
y_score = text_clf_svm.decision_function(X_validation)

In [None]:
# Compare the SGD pipeline's predictions with the held-out validation labels.
print('Confusion Matrix:\n', confusion_matrix(y_validation, predicted_svm))
print('Classification Report:\n', classification_report(y_validation, predicted_svm))
print('Accuracy Score:', accuracy_score(y_validation, predicted_svm))

## Grid Search
Find the optimum model parameters

In [None]:
# Candidate settings, keyed as '<pipeline step name>__<parameter>' for the steps of text_clf.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# Try every combination in the grid on the training data; n_jobs=-1 requests parallel execution.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(X_train, y_train)
print('Best Score:', gs_clf.best_score_)
print('Best Parameters:', gs_clf.best_params_)

## Confusion matrix & Heatmap

In [None]:
# Confusion matrix of the current y_predicted against the validation labels,
# drawn as an annotated heatmap.
conf_mat = confusion_matrix(y_validation, y_predicted)
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above and label them through the same object.
sns.heatmap(conf_mat, annot=True, fmt='d', ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

## Read test data
Reading testing data and cleaning

In [None]:
# Parallel lists: position k in each list belongs to the k-th stored row of test.csv.
# They rebind the names previously used for the training data.
title = []
category = []
text = []

# Set the maximum field size accepted by the csv parser to 300000.
csv.field_size_limit(300000)

# newline='' hands line-ending handling to the csv module, as its documentation
# asks for; without it, newlines embedded in quoted fields can be mis-parsed.
with open('/content/drive/My Drive/Colab Notebooks/test.csv', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    # The first row is treated as the header: it is counted but not stored.
    line_count = 0 if next(csv_reader, None) is None else 1
    for row in csv_reader:
        if not row:
            continue  # skip blank lines
        # Columns 1, 3 and 5 are stored as title, category and text respectively.
        title.append(rm_spec_ch(row[1]))
        category.append(row[3])
        text.append(rm_spec_ch(row[5]))
        line_count += 1
    print(f'Processed {line_count} lines.')

# NOTE(review): these counters are reset here but not updated anywhere in this cell.
total_char_count = 0
total_token_count = 0
total_frequent_token_count = 0

# Decomposing into tokens (testing data)
for i in range(len(text)):
    # Surround the two sentence-ending marks with spaces so each becomes its own token.
    spaced = text[i].replace('.', ' . ').replace('؟', ' ؟ ')
    # Split on single spaces, drop the empty strings produced by runs of spaces,
    # and replace every all-numeric token with the placeholder 'N'.
    # One filtering pass replaces the repeated list.remove('') scans.
    text[i] = ['N' if tok.isnumeric() else tok for tok in spaced.split(' ') if tok]

# Replacing least frequent words with "UNK"
# The vocabulary (freq_word_dict) comes from the training data; being a dict, each
# membership test is a hash lookup.
for tokens in text:
    for position, token in enumerate(tokens):
        if token not in freq_word_dict:
            # Overwrite in place; the token list keeps its length.
            tokens[position] = 'UNK'

# Keep only the test articles that are no longer than average_news_length tokens
# and right-pad each of them with 'PAD' up to that length.
filtered_title = []
filtered_category = []
filtered_text = []
for idx, tokens in enumerate(text):
    if len(tokens) > average_news_length:
        continue  # articles longer than the average are dropped
    tokens.extend(['PAD'] * (average_news_length - len(tokens)))
    filtered_title.append(title[idx])
    # .get() yields None for a label that has no entry in category2code.
    filtered_category.append(category2code.get(category[idx]))
    filtered_text.append(tokens)

title, category, text = filtered_title, filtered_category, filtered_text

## Create corresponding *DataFrame*

In [None]:
df_test = pd.DataFrame(list(zip(title, category, text)), columns=['Title', 'Category', 'Text'])
# Each article is a list of tokens; join it back into one space-separated string.
df_test['Text'] = [" ".join(news) for news in df_test['Text'].values]
# Articles whose label had no category code are missing here; give them the code 10.
# fillna is used instead of chained indexing (df_test['Category'][mask] = ...), which
# pandas does not guarantee to write back into df_test.
df_test['Category'] = df_test['Category'].fillna(10)

## Test on TEST data

### Naïve Bayes

In [None]:
# clf was fitted on TF-IDF features, so the test texts must pass through the same
# count -> TF-IDF transformation (using the already-fitted objects) before prediction.
X_test_tfidf = tfidf_transformer.transform(count_vect.transform(df_test['Text']))
y_predicted = clf.predict(X_test_tfidf)

In [None]:
# Compare the current y_predicted with the test-set category codes.
print('Confusion Matrix:\n', confusion_matrix(df_test['Category'], y_predicted))
print('Classification Report:\n', classification_report(df_test['Category'], y_predicted))
print('Accuracy Score:', accuracy_score(df_test['Category'], y_predicted))

### SVM

In [None]:
# Class predictions and raw decision-function scores for the test set.
# NOTE(review): y_score is not read by any cell visible here.
predicted_svm = text_clf_svm.predict(df_test['Text'])
y_score = text_clf_svm.decision_function(df_test['Text'])

In [None]:
# Compare the SGD pipeline's predictions with the test-set category codes.
print('Confusion Matrix:\n', confusion_matrix(df_test['Category'], predicted_svm))
print('Classification Report:\n', classification_report(df_test['Category'], predicted_svm))
print('Accuracy Score:', accuracy_score(df_test['Category'], predicted_svm))