Authors: Amirhossein Ghadami 

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, Normalizer
from stop_words import get_stop_words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix 
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import re
from nltk import pos_tag
import string
import nltk

In [2]:
#Importing data
def import_data(direction):

    try:
        with open(direction ,'r',encoding='utf8') as file:
            train = file.readlines()
            
    except:
        print('wrong direction')

    
            
    return train   
    

In [3]:
# This function does two jobs {1-tokenization    2-making corpus}

def change(x ,tokenization = True):
    
    if tokenization == True:
        
        list = []
        for i in x:
            item = i.strip().split()
            list.append(item)
            

        return list
    
    else :
        string = []
        for i in x:
            str_list = []
            str_list = ' '.join(i)
            string.append(str_list)
            
        return string 

In [4]:
#Split data into label and samples 

def splitting_data(lst):
    
    labels=[]
    samples = []
    for i in lst:
        
        splited_item = i.split(',')[0]
        samples.append(i.split(',')[1:])
        string_label = re.findall('[1-4]',splited_item)[0]
        labels.append(int(string_label))
    
    samples = change(samples ,tokenization=False)
    
    return samples,labels

In [5]:
#remove stop words

def remove_stop_words(lst):
    
    stop_words = get_stop_words('en')
    for i in lst:
        lent = len(i)
        j = 0 
        while j<lent:
            if i[j].lower() in stop_words:
                    del i[j]
                    lent -=1
                    j-=1
            j+=1
            
    return lst          

In [6]:
# normalization

def stemmer(lst):
    st = PorterStemmer()
    samples_stemm=[]
    
    for i in lst:
        b=[]
        for j in i:
            b.append(st.stem(j))
            
        samples_stemm.append(b)        
            
    return samples_stemm       

In [7]:
#normalization

def lemmatizing(lst):
    lem = WordNetLemmatizer()
    
    samples_lem =[]
    for i in lst:
        c =[]
        for j in i:
            c.append(lem.lemmatize(j))
        samples_lem.append(c)
    return samples_lem     

In [8]:
#TF_IDF

def tf_idf(lst1):

    vectorizer = TfidfVectorizer()
    vectorizer.fit(lst1)
    list1      = vectorizer.transform(lst1)

    

    
    return list1

In [55]:
# Part Of Speech

def find_part_of_speech(lst):
    possition =[]
    for i in lst :
        item  = pos_tag(i)
        possition.append(item)
        
    return possition    

In [63]:
#This function extracs verbs and adjs

def extract_adj_noun(lst_of_tuple):
    extracted = []
    for i in lst_of_tuple:
        item = [j[0]  for j in i if j[1]=='VB'or j[1]=='JJ']
        extracted.append(item)
        
    return extracted    

In [34]:
# change to numpy

def sparce_to_numpy(sparce_matrix):
    
    df=pd.DataFrame.sparse.from_spmatrix(sparce_matrix)
    array=df.to_numpy()
    
    return array

In [35]:
def mixed_two_array(arr1,arr2):
    return np.concatenate((arr1,arr2),axis=1)

In [13]:
def preprocessing_data(direction):
    
    train                        = import_data(direction)                    # importing data
    
    samples , labels             = splitting_data(train[:10000])             # splitting Data
    
    samples_tokenize             = change(samples)                           # Tokenization step
    
    samples_stopRemove           = remove_stop_words(samples_tokenize)       # removing Junck words 
    
    samples_lemma                = lemmatizing(samples_stopRemove)           # lemmatization Step

    samples_stemm                = stemmer(samples_lemma)                    # Stemming Step
    
    corpus_train                 = change(samples_lemma,tokenization=False)  # corpuse for each row
    
    samples_tf_idf               = tf_idf(corpus_train)                      #TF _ IDF
    
    samples_ready                = sparce_to_numpy(samples_tf_idf)           # change to numpy array
    
    labels_ready                 = labels
    
    
    #---------------------------part of speech
    # by This part we extract adj and verbs and add it to the train TF_IDF dataset, to emphesis the adj and verbs 
    
    
    list_of_tuple                = find_part_of_speech(samples_stemm )  
    
    extracted_words              = extract_adj_noun(list_of_tuple)          
    
    corpus_part_speech           = change(extracted_words,tokenization=False) 
    
    tf_idf_part_speech           = tf_idf(corpus_part_speech)
    
    part_speech_ready            = sparce_to_numpy(tf_idf_part_speech)
    
    
    #---------------------------mixed
    
    mixed_samples                = mixed_two_array(samples_ready,part_speech_ready)
    
    
    x_train , x_test ,y_train ,y_test = train_test_split(mixed_samples ,labels_ready , random_state = 2 ,test_size=0.2)
    
    return x_train , x_test ,y_train ,y_test
    

In [14]:
def train_test_logistic_model(x_train,x_test,y_train,y_test):
    
    classifier= LogisticRegression()                                       #make a classifier object                      
    classifier.fit(x_train , y_train)                                            
    
    y_predict = classifier.predict(x_test)
    accuracy  = accuracy_score(y_predict , y_test) 

    return accuracy

In [15]:
def train_test_nn_model(x_train,x_test,y_train,y_test):

    
    nn = MLPClassifier(early_stopping=True,hidden_layer_sizes=(100, ),activation='logistic',random_state=0)
    nn.fit(x_train,y_train)
    
    y_predict = nn.predict(x_test)
    accuracy  = accuracy_score(y_predict , y_test) 
        
    return accuracy

In [16]:
x_train , x_test ,y_train ,y_test = preprocessing_data('train.csv')

In [17]:
train_test_nn_model(x_train,x_test,y_train,y_test)

0.8875

In [18]:
train_test_logistic_model(x_train,x_test,y_train,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.885