Import Necessary Libraries 

In [173]:
!pip install parsivar



In [174]:
!pip install langdetect



In [175]:
!pip install finglish



In [176]:
import pandas as pd
from finglish import f2p
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from google.colab import drive
import os
import sys
from IPython.display import clear_output
from langdetect import detect, DetectorFactory
from parsivar import Normalizer
from parsivar import FindStems
from parsivar import Tokenizer
from parsivar import POSTagger
from string import punctuation
import gensim
import seaborn as sns
import statistics
from collections import Counter
import math
from math import inf
import pickle
import json
import ast
import re
import string
from math import log2

In [177]:
# Runtime settings: update this cell to match your own environment.
#
# Folder layout sketched by the original author:
#  test_data
#       | text_file_1.txt
#       | text_file_2.txt
#       | text_file_3.txt
#       .............
#       | text_file_n.txt
#
# NOTE(review): the data-loading cell reads test_data_path + "female" and
# test_data_path + "male", i.e. two sub-folders rather than a flat folder.


config = {
    "current_working_dir" : "/content/drive/MyDrive/NLP-Spring 99-00/HW1/",
    "test_data_path" : "/content/drive/MyDrive/NLP-Spring 99-00/HW1/test/", # location of the test files
    "model_name" : "model.svm"
}

Mount Drive

In [178]:
# Mount Google Drive at /content/drive, the root of every path in `config`.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
# Shared parsivar helpers reused by the feature cells below.
normalizer = Normalizer()
tokenizer = Tokenizer()
# NOTE(review): stemmer is not referenced anywhere else in the visible notebook.
stemmer = FindStems()
tagger = POSTagger(tagging_model="stanford")
# Characters treated as "alphabet" by count_alphabet: the content of
# persian_alphabet.txt with newlines and tabs removed.
alphabet = ""
with open(config["current_working_dir"] + "persian_alphabet.txt", 'r' , encoding='utf-8') as alphabet_file:
    alphabet = alphabet_file.read().replace("\n" , "").replace("\t" , "")
# ASCII digits counted by count_digits.
numbers = "0123456789"

Read Files

In [180]:
def read_files(folderpath , gender):
    """Read every file in ``folderpath`` as UTF-8 text and label it with ``gender``.

    Parameters
    ----------
    folderpath : str
        Directory whose entries are all opened as text files.
    gender : str
        Label stored in the ``gender`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_path``, ``file_text`` and ``gender`` with a 0..n-1 index;
        an empty frame with the same columns when the folder is empty.
    """
    columns = ['file_path' , 'file_text' , 'gender']
    records = []
    # Sorted so the row order is reproducible: os.listdir returns entries in
    # arbitrary order.
    for filename in sorted(os.listdir(folderpath)):
        current_file_path = folderpath + "/" + filename
        with open(current_file_path , 'r' , encoding='utf-8') as current_file:
            current_file_text = current_file.read()
        records.append({"file_path" : current_file_path , "file_text" : current_file_text , "gender" : gender})
    # Build the frame once: DataFrame.append inside the loop copied the whole
    # frame on every iteration and no longer exists in pandas 2.x.
    return pd.DataFrame(records , columns=columns)

In [183]:
# Load the two labelled sub-folders of the test set ('f' = female, 'm' = male).
female_folder = config['test_data_path'] + "female"
male_folder = config['test_data_path'] + "male"
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index gives a clean 0..n-1 index with the female rows first.
data = pd.concat(
    [read_files(female_folder , 'f') , read_files(male_folder , 'm')],
    ignore_index=True,
)

In [184]:
# Preview the first rows of the loaded documents.
data.head()

Unnamed: 0,file_path,file_text,gender
0,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,سلام به همه دوستان \nما هتل ارم 5 ستاره رزرو ک...,f
1,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,26/12/89\nهتل تمیزی بود با منظره واقعا عالی رو...,f
2,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,ما فقط 3 روز در این هتل اقامت داشتیم و خیلی را...,f
3,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,من در ابان 89 به هتل داریوش رفتم فضای هتل بسیا...,f
4,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,سلام...من 1 ماه پیش به همراه خانوادم به کیش(هت...,f


In [185]:
# Normalize the raw text of every document with the shared parsivar Normalizer.
data['normalized_text'] = data['file_text'].apply(normalizer.normalize)

In [186]:
# Remove every run of digits from the normalized text.
data['cleaned_number_text'] = data['normalized_text'].map(
    lambda document: re.sub(r'\d+', '', document)
)

In [187]:
# Delete the characters listed in string.punctuation (ASCII punctuation only)
# from the digit-free text.
data['cleaned_punc_text'] = data['cleaned_number_text'].map(
    lambda document: document.translate(str.maketrans('','', string.punctuation))
)

In [188]:
# Split the normalized text (digits and punctuation still present) into sentences.
data['tokenized_sentence'] = data['normalized_text'].apply(tokenizer.tokenize_sentences)

In [189]:
def tokenize_words(text):
    """Tokenize ``text`` into words, transliterating them when the text is not Persian/Arabic.

    The language of the whole text is detected once. If it is neither 'fa' nor
    'ar', every token is passed through ``f2p`` (Finglish -> Persian script);
    otherwise the tokens are returned unchanged.

    Parameters
    ----------
    text : str
        Document text to tokenize.

    Returns
    -------
    list
        The word tokens of ``text``.
    """
    # Detect only once: langdetect documents its detection as non-deterministic
    # unless DetectorFactory.seed is set, so two separate calls on the same
    # text could disagree and make the old two-call condition inconsistent.
    language = detect(text)
    tokens = tokenizer.tokenize_words(text)
    if language not in ('fa', 'ar'):
        #convert finglish words to farsi
        return [f2p(word) for word in tokens]
    return tokens
# Word tokens of the digit- and punctuation-free text.
data['tokenized_word'] = data['cleaned_punc_text'].apply(tokenize_words)

In [190]:
# Preview the frame with the normalized, cleaned and tokenized columns added.
data.head()

Unnamed: 0,file_path,file_text,gender,normalized_text,cleaned_number_text,cleaned_punc_text,tokenized_sentence,tokenized_word
0,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,سلام به همه دوستان \nما هتل ارم 5 ستاره رزرو ک...,f,سلام به همه دوستان \nما هتل ارم 5 ستاره رزرو ک...,سلام به همه دوستان \nما هتل ارم ستاره رزرو کر...,سلام به همه دوستان \nما هتل ارم ستاره رزرو کر...,"[سلام به همه دوستان , ما هتل ارم 5 ستاره رزرو...","[سلام, به, همه, دوستان, ما, هتل, ارم, ستاره, ر..."
1,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,26/12/89\nهتل تمیزی بود با منظره واقعا عالی رو...,f,26 / 12 / 89\nهتل تمیزی‌بود با منظره واقعا عال...,/ / \nهتل تمیزی‌بود با منظره واقعا عالی رو ب...,\nهتل تمیزی‌بود با منظره واقعا عالی رو به ...,"[26 / 12 / 89 , هتل تمیزی‌بود با منظره واقعا ع...","[هتل, تمیزی‌بود, با, منظره, واقعا, عالی, رو, ب..."
2,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,ما فقط 3 روز در این هتل اقامت داشتیم و خیلی را...,f,ما فقط 3 روز در این هتل اقامت داشتیم و خیلی را...,ما فقط روز در این هتل اقامت داشتیم و خیلی راض...,ما فقط روز در این هتل اقامت داشتیم و خیلی راض...,[ما فقط 3 روز در این هتل اقامت داشتیم و خیلی ر...,"[ما, فقط, روز, در, این, هتل, اقامت, داشتیم, و,..."
3,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,من در ابان 89 به هتل داریوش رفتم فضای هتل بسیا...,f,من در ابان 89 به هتل داریوش رفتم فضای هتل بسیا...,من در ابان به هتل داریوش رفتم فضای هتل بسیار ...,من در ابان به هتل داریوش رفتم فضای هتل بسیار ...,[من در ابان 89 به هتل داریوش رفتم فضای هتل بسی...,"[من, در, ابان, به, هتل, داریوش, رفتم, فضای, هت..."
4,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,سلام...من 1 ماه پیش به همراه خانوادم به کیش(هت...,f,سلام ... من 1 ماه پیش به همراه خانوادم به کیش ...,سلام ... من ماه پیش به همراه خانوادم به کیش (...,سلام من ماه پیش به همراه خانوادم به کیش هتل...,"[سلام ..., من 1 ماه پیش به همراه خانوادم به ...","[سلام, من, ماه, پیش, به, همراه, خانوادم, به, ک..."


Character Based Features

In [191]:
# C: character length of each raw document; denominator of the ratio features below.
C = data['file_text'].apply(len)

In [192]:
# F1: document length in characters.
data['F1'] = C

In [193]:
def count_alphabet(text):
    """Return how many characters of ``text`` occur in the notebook-level ``alphabet`` string."""
    return sum(1 for symbol in text if symbol in alphabet)

In [194]:
# F2: alphabet characters in the normalized text, divided by the raw length C.
data['F2'] = data['normalized_text'].apply(count_alphabet) / C

In [195]:
def count_digits(text):
    """Return how many characters of ``text`` occur in the notebook-level ``numbers`` string."""
    return sum(1 for symbol in text if symbol in numbers)

In [196]:
# F3: digit characters in the normalized text, divided by the raw length C.
data['F3'] = data['normalized_text'].apply(count_digits) / C

In [197]:
# F4: space characters in the raw text, divided by the raw length C.
data['F4'] = data['file_text'].map(lambda document: document.count(" ")) / C

In [198]:
# F5: tab characters in the raw text, divided by the raw length C.
data['F5'] = data['file_text'].map(lambda document: document.count("\t")) / C

In [199]:
def count_special_chars(text):
    """Return how many characters of ``text`` are in ``string.punctuation`` (ASCII punctuation)."""
    return sum(1 for symbol in text if symbol in punctuation)

In [200]:
# F6: special (ASCII punctuation) characters in the raw text, divided by the raw length C.
data['F6'] = data['file_text'].apply(count_special_chars) / C

Word Based Features

In [201]:
# N: number of word tokens per document, also kept as a column for row-wise use.
N = data['tokenized_word'].apply(len)
data['N'] = N

In [202]:
# F7: number of word tokens in the document.
data['F7'] = N

In [203]:
# F8: mean word length (in characters) per document.
data['F8'] = data['tokenized_word'].apply(
    lambda words: statistics.mean([len(word) for word in words])
)

In [204]:
# V: number of distinct words per document; F9: type/token ratio V / N.
V = data['tokenized_word'].map(set).map(len)
data['V'] = V
data['F9'] = V/N

In [205]:
# F10: share of words with more than 4 characters.
data['F10'] = data['tokenized_word'].map(lambda words: sum(len(word) > 4 for word in words))/N

In [206]:
# F11: share of words with fewer than 4 characters.
data['F11'] = data['tokenized_word'].map(lambda words: sum(len(word) < 4 for word in words))/N

In [207]:
def get_word_by_count(list_of_words , count):
    """Return the number of distinct words that occur exactly ``count`` times in ``list_of_words``."""
    occurrences = Counter(list_of_words)
    return sum(1 for times_seen in occurrences.values() if times_seen == count)

In [208]:
# F12: hapax legomena (words seen exactly once) divided by the token count N.
data['F12'] = data['tokenized_word'].map(lambda words: get_word_by_count(words , 1))/N

In [209]:
# F13: hapax dislegomena (words seen exactly twice) divided by the token count N.
data['F13'] = data['tokenized_word'].map(lambda words: get_word_by_count(words , 2))/N

In [210]:
def Yule_K(v,n,list_of_words):
    """Yule's K measure: ``10^4 * (-1/n + sum_i V_i * (i/n)^2)``.

    ``V_i`` is the number of distinct words that occur exactly ``i`` times.

    Parameters
    ----------
    v : int
        Vocabulary size. Kept for interface compatibility; no longer used,
        because bounding the sum by ``v`` dropped every word that occurs more
        than ``v`` times.
    n : int
        Total number of tokens.
    list_of_words : list
        The tokens of the document.

    Returns
    -------
    float or None
        The measure, or None when ``n`` is 0 (same "undefined" convention as
        Simpsons and Honores).
    """
    if n == 0:
        return None
    # Frequency spectrum {i: V_i}, built in one pass instead of recounting the
    # whole document for every candidate frequency.
    spectrum = Counter(Counter(list_of_words).values())
    val = -1/n
    # Ascending order keeps the floating-point summation order of the old loop.
    for i in sorted(spectrum):
        val = val + spectrum[i]*(i/n)**2
    val = val * 10**4
    return val

In [211]:
def Simpsons(v,n,list_of_words):
    """Simpson's D measure: ``sum_i V_i * (i/n) * (i-1)/(n-1)``.

    ``V_i`` is the number of distinct words that occur exactly ``i`` times.

    Parameters
    ----------
    v : int
        Vocabulary size. Kept for interface compatibility; no longer used,
        because bounding the sum by ``v`` dropped every word that occurs more
        than ``v`` times.
    n : int
        Total number of tokens.
    list_of_words : list
        The tokens of the document.

    Returns
    -------
    float, int or None
        The measure; 0 for an empty token list; None when a denominator is
        zero (e.g. a one-token document, where ``n - 1`` is 0).
    """
    # Frequency spectrum {i: V_i}, built once instead of once per frequency.
    spectrum = Counter(Counter(list_of_words).values())
    val = 0
    for i in sorted(spectrum):
        try:
            val = val + spectrum[i]*(i/n)*(i-1)/(n-1)
        except ZeroDivisionError:
            return None
    return val

In [212]:
def Sichel(v,list_of_words):
    """Sichel's S measure: words occurring exactly twice, divided by the vocabulary size ``v``.

    Parameters
    ----------
    v : int
        Vocabulary size (number of distinct words).
    list_of_words : list
        The tokens of the document.

    Returns
    -------
    float or None
        The ratio, or None when ``v`` is 0. Returning None instead of raising
        matches Simpsons and Honores, so an empty document becomes a missing
        value rather than a crash.
    """
    if v == 0:
        return None
    dislegomena = sum(1 for times_seen in Counter(list_of_words).values() if times_seen == 2)
    return dislegomena / v

In [213]:
def Honores(v,n,list_of_words):
    """Honore's R measure: ``100 * log10(n) / (1 - V_1 / v)``.

    ``V_1`` is the number of words that occur exactly once.

    Parameters
    ----------
    v : int
        Vocabulary size (number of distinct words).
    n : int
        Total number of tokens.
    list_of_words : list
        The tokens of the document.

    Returns
    -------
    float or None
        The measure, or None when it is undefined: ``n`` is not positive
        (``log10`` would raise ValueError, which was not caught before),
        ``v`` is 0, or every word occurs exactly once (zero denominator).
    """
    hapax = sum(1 for times_seen in Counter(list_of_words).values() if times_seen == 1)
    if n <= 0 or v == 0 or hapax == v:
        return None
    return 100 * (math.log10(n)) / (1 - hapax / v)

In [214]:
# Vocabulary-richness features, computed row by row from V (distinct words),
# N (tokens) and the token list: F14 Yule's K, F15 Simpson's D, F16 Sichel's S,
# F17 Honore's R. Some of the helpers return None when a measure is undefined.
data['F14'] = data.apply(lambda row : Yule_K(row['V'] , row['N'] , row['tokenized_word']) , axis=1)
data['F15'] = data.apply(lambda row : Simpsons(row['V'] , row['N'] , row['tokenized_word']) , axis=1)
data['F16'] = data.apply(lambda row : Sichel(row['V'] , row['tokenized_word']) , axis=1)
data['F17'] = data.apply(lambda row : Honores(row['V'] , row['N'] , row['tokenized_word']) , axis=1)

In [325]:
# Fill undefined (None/NaN) richness scores by linear interpolation between
# neighbouring rows. limit_direction='forward' leaves leading gaps unfilled.
for richness_feature in ['F14', 'F15', 'F16', 'F17']:
    # Assign the result back: calling interpolate(inplace=True) on a selected
    # column is chained assignment, which pandas warns about and which stops
    # updating `data` under copy-on-write. astype(float) turns None into NaN
    # so the column is numeric before interpolating.
    data[richness_feature] = (
        data[richness_feature]
        .astype(float)
        .interpolate(method ='linear', limit_direction ='forward')
    )

In [216]:
def entropy(list_of_words):
    """Return the base-2 Shannon entropy of the word-frequency distribution of ``list_of_words``."""
    frequencies = Counter(list_of_words).values()
    total = sum(frequencies)
    result = 0
    for freq in frequencies:
        result += freq / total * log2(total / freq)
    return result

# F18: entropy of the word distribution of each document.
data['F18'] = data['tokenized_word'].apply(entropy)

In [217]:
def get_count_by_len(list_of_words , length):
    """Return how many words in ``list_of_words`` have exactly ``length`` characters."""
    length_histogram = Counter(len(word) for word in list_of_words)
    return length_histogram[length]

# Word-length histogram: F19..F38 hold the number of words of length 1..20.
for i in range(1, 21):
    feature_name = "F{}".format(i + 18)
    # Bind the current length as a default argument of the callback.
    data[feature_name] = data['tokenized_word'].map(
        lambda words, length=i: get_count_by_len(words , length)
    )

In [218]:
def load_lexicon(name):
    """Load ``Lexicon_<name>.pickle`` from the working directory as a set of normalized words.

    A set is returned (the cells used lists before) so that the per-token
    membership test is a hash lookup instead of a scan of the whole lexicon.

    NOTE(security): pickle.load can execute arbitrary code from the file;
    only load lexicon files you trust.
    """
    with open(config['current_working_dir'] + 'Lexicon_' + name + '.pickle', 'rb') as file:
        return {normalizer.normalize(x) for x in pickle.load(file)}


def count_lexicon_hits(tokens , lexicon):
    """Return the number of tokens (repeats included) that appear in ``lexicon``."""
    return sum(1 for x in tokens if x in lexicon)


lexicon_positive = load_lexicon('Positive')
lexicon_negative = load_lexicon('Negative')
lexicon_anger = load_lexicon('Anger')
lexicon_anticipation = load_lexicon('Anticipation')
lexicon_disgust = load_lexicon('Disgust')
lexicon_fear = load_lexicon('Fear')
lexicon_joy = load_lexicon('Joy')
lexicon_sadness = load_lexicon('Sadness')
lexicon_surprise = load_lexicon('Surprise')
lexicon_trust = load_lexicon('Trust')

# One feature per lexicon: number of document tokens found in it.
# The table keeps the F39..F48 order so the columns are created in the same
# order as the individual cells created them.
LEXICON_FEATURES = [
    ('F39', lexicon_positive),      # positive words
    ('F40', lexicon_negative),      # negative words
    ('F41', lexicon_anger),         # anger words
    ('F42', lexicon_anticipation),  # anticipation words
    ('F43', lexicon_disgust),       # disgust words
    ('F44', lexicon_fear),          # fear words
    ('F45', lexicon_joy),           # joy words
    ('F46', lexicon_sadness),       # sadness words
    ('F47', lexicon_surprise),      # surprise words
    ('F48', lexicon_trust),         # trust words
]
for feature_name, lexicon in LEXICON_FEATURES:
    data[feature_name] = data['tokenized_word'].apply(
        lambda lst, lexicon=lexicon: count_lexicon_hits(lst , lexicon)
    )

Syntax Based Features

In [238]:
def count_punctuation(text , kind , pattern):
    """Count ``pattern`` in ``text``.

    ``kind == 'char'`` counts plain occurrences of the string ``pattern``;
    any other kind counts the regular-expression matches of ``pattern``.
    """
    if kind == 'char':
        return text.count(pattern)
    return len(re.findall(pattern , text))


# Punctuation features over the normalized text, each divided by the raw
# document length C. 'char' rows count one character, 'run' rows count runs
# of two or more repeated marks. The table keeps the F49..F58 order so the
# columns are created in the same order as the individual cells created them.
PUNCTUATION_FEATURES = [
    ('F49', 'char', "'"),
    ('F50', 'char', "،"),
    ('F51', 'char', "."),
    ('F52', 'char', ":"),
    ('F53', 'char', ";"),
    ('F54', 'char', "؟"),
    ('F55', 'run', "؟؟+"),
    ('F56', 'char', "!"),
    ('F57', 'run', "!!+"),
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about. The pattern is unchanged.
    ('F58', 'run', r"(\.)(\.)+"),
]
for feature_name, kind, pattern in PUNCTUATION_FEATURES:
    data[feature_name] = data['normalized_text'].apply(
        lambda text, kind=kind, pattern=pattern: count_punctuation(text , kind , pattern)
    )/C

Structural Features

In [248]:
# F59: number of newline characters in the raw document.
data['F59'] = data['file_text'].map(lambda document: document.count("\n"))

In [249]:
# S / F60: number of sentences per document.
S = data['tokenized_sentence'].apply(len)
data['F60'] = S

In [250]:
# F61: mean number of word tokens per sentence.
data['F61'] = data['tokenized_sentence'].apply(
    lambda sentences: statistics.mean(
        [len(tokenizer.tokenize_words(sentence)) for sentence in sentences]
    )
)

In [251]:
def ratio_of_empty_lines(text):
    """Return the fraction of newline-separated lines in ``text`` that are empty or whitespace-only."""
    lines = text.split("\n")
    blank_lines = sum(1 for line in lines if not line.strip())
    return blank_lines / len(lines)

In [252]:
# F62: fraction of empty lines in the raw document.
data['F62'] = data['file_text'].apply(ratio_of_empty_lines)

In [253]:
def avg_length_of_nonempty_lines(text):
    """Return the mean length of the lines of ``text`` that are not empty or whitespace-only.

    Line length is measured on the unstripped line. Returns 0 when there is
    no such line.
    """
    lengths = [len(line) for line in text.split("\n") if line.strip()]
    if not lengths:
        return 0
    return sum(lengths) / len(lengths)

In [254]:
# F63: mean length of the non-empty lines of the raw document.
data['F63'] = data['file_text'].apply(avg_length_of_nonempty_lines)

Grammar-Related Features

In [255]:
# Gender-specific words: F64 / F65 count how many entries of each list occur
# as a substring of the cleaned document (each entry counted at most once).
female_words = ["شوهر" , "شوهرم"]
male_words = ["خانمم" , "زنم"]
data['F64'] = data['cleaned_punc_text'].map(
    lambda document: sum(word in document for word in female_words)
).astype(int)
data['F65'] = data['cleaned_punc_text'].map(
    lambda document: sum(word in document for word in male_words)
).astype(int)

In [256]:
def load_word_list(filename):
    """Unpickle the word collection stored as ``filename`` in the working directory.

    NOTE(security): pickle.load can execute arbitrary code from the file;
    only load files you trust.
    """
    with open(config['current_working_dir'] + filename, 'rb') as file:
        return pickle.load(file)


def word_list_ratio(word_list):
    """Return, per document, the number of tokens found in ``word_list`` divided by the token count N."""
    return data['tokenized_word'].apply(lambda lst : sum(1 for x in lst if x in word_list))/N


# F66: interrogative words, F67: conjunctions, F68: interjections.
# Columns are created in the same order as the individual cells created them.
list_of_all_interrogatives = load_word_list('interrogative.pickle')
data['F66'] = word_list_ratio(list_of_all_interrogatives)

list_of_all_conjunctions = load_word_list('conjunctions.pickle')
data['F67'] = word_list_ratio(list_of_all_conjunctions)

list_of_all_interjections = load_word_list('interjections.pickle')
data['F68'] = word_list_ratio(list_of_all_interjections)

In [262]:
#Part Of Speech Tagger
parsed_documents = []
for index, row in data.iterrows():
    # Delete and re-create the tagger every 20 documents; the original note
    # says this avoids a Java "head" (presumably heap) error.
    if index % 20 == 0:
        del tagger
        tagger = POSTagger(tagging_model="stanford")

    # Progress indicator: show only the index of the current document.
    clear_output()
    print(str(index))
    parsed_documents.append(tagger.parse(row['tokenized_word']))

# Build the Series once from a list. Series.append inside the loop copied the
# whole Series on every iteration and no longer exists in pandas 2.x.
# dtype=object keeps one parse result per row; index=data.index keeps the rows
# aligned with `data` for the tag-based feature cells.
tags = pd.Series(parsed_documents , index=data.index , dtype=object)

109


In [263]:
# F69: tokens tagged 'PO', divided by the token count N.
# NOTE(review): the original comment called these "propositions"; confirm
# what the 'PO' tag stands for in the tagger's tag set.
data['F69'] = tags.map(lambda tagged: sum(pair[1] == 'PO' for pair in tagged)) / N

Linguistic-Psychological Features

In [264]:
# Load the tab-separated PersianSWN sentiment lexicon and weight both polarity
# scores by the confidence column.
polarity_df = pd.read_csv(
    config['current_working_dir'] + 'PersianSWN.csv',
    sep='\t',
    names=["id" , "word" , "conf" , "plus" , "minus"],
    header=None,
)
polarity_df["plus_prob"] = polarity_df["conf"] * polarity_df["plus"]
polarity_df["minus_prob"] = polarity_df["conf"] * polarity_df["minus"]
# Keep only word, conf and the two weighted scores.
polarity_df = polarity_df.drop(columns=["id" , "plus" , "minus"])

In [265]:
# Average the weighted scores over all rows of the same word, then join the
# two averages into one row per word.
ploar_plus = polarity_df.groupby("word", as_index=False)["plus_prob"].mean()
ploar_minus = polarity_df.groupby("word", as_index=False)["minus_prob"].mean()
word_polatiry = ploar_plus.merge(ploar_minus , how="inner" , on="word")

In [266]:
# A word is "good" when its positive score beats the negative one by more than
# 0.15 and is itself above 0.2; "bad" is the mirror image.
plus_score = word_polatiry["plus_prob"]
minus_score = word_polatiry["minus_prob"]
good_words = word_polatiry.loc[(plus_score > minus_score + 0.15) & (plus_score > 0.2), "word"].unique()
bad_words = word_polatiry.loc[(minus_score > plus_score + 0.15) & (minus_score > 0.2), "word"].unique()

In [267]:
# Normalize the polarity words the same way as the documents. They are kept as
# sets (lists before): the feature cells test every token with `in`, which is
# a hash lookup on a set but a full scan on a list.
good_words = {normalizer.normalize(x) for x in good_words}
bad_words = {normalizer.normalize(x) for x in bad_words}

In [268]:
# F70: number of tokens found in good_words.
data['F70'] = data['tokenized_word'].map(lambda words: sum(word in good_words for word in words))

In [269]:
# F71: number of tokens found in bad_words.
data['F71'] = data['tokenized_word'].map(lambda words: sum(word in bad_words for word in words))

In [270]:
# Load the pickled collection of color words from the working directory.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open(config['current_working_dir'] + 'colors.pickle', 'rb') as file:
    list_of_all_colors = pickle.load(file)

In [271]:
# F72: number of tokens found in the color word list.
data['F72'] = data['tokenized_word'].map(lambda words: sum(word in list_of_all_colors for word in words))

In [272]:
# F73: number of tokens tagged 'ADJ' (adjectives).
data['F73'] = tags.map(lambda tagged: sum(pair[1] == 'ADJ' for pair in tagged))

In [273]:
# F74: number of tokens tagged 'ADV' (adverbs).
data['F74'] = tags.map(lambda tagged: sum(pair[1] == 'ADV' for pair in tagged))

In [274]:
# F75: number of tokens tagged 'PRO' (pronouns).
data['F75'] = tags.map(lambda tagged: sum(pair[1] == 'PRO' for pair in tagged))

In [275]:
# Load the pickled collection of doubt words from the working directory.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open(config['current_working_dir'] + 'doubt.pickle', 'rb') as file:
    list_of_all_doubt = pickle.load(file)

In [276]:
# F76: number of tokens found in the doubt word list.
data['F76'] = data['tokenized_word'].map(lambda words: sum(word in list_of_all_doubt for word in words))

In [277]:
# Load the pickled collection of certainty words from the working directory.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open(config['current_working_dir'] + 'certainty.pickle', 'rb') as file:
    list_of_all_certainties = pickle.load(file)

In [278]:
# F77: number of tokens found in the certainty word list.
data['F77'] = data['tokenized_word'].map(lambda words: sum(word in list_of_all_certainties for word in words))

In [279]:
# F78: 1 when the cleaned document contains any greeting as a substring, else 0.
greetings = ["سلام" , "خوبید"]
data['F78'] = data['cleaned_punc_text'].map(
    lambda document: any(word in document for word in greetings)
).astype(int)

In [280]:
# F79: 1 when the cleaned document contains any farewell as a substring, else 0.
farewells = ["خداحافظ" , "خدانگهدار"]
data['F79'] = data['cleaned_punc_text'].map(
    lambda document: any(word in document for word in farewells)
).astype(int)

In [281]:
# Inspect the last rows with all hand-crafted feature columns added.
data.tail()

Unnamed: 0,file_path,file_text,gender,normalized_text,cleaned_number_text,cleaned_punc_text,tokenized_sentence,tokenized_word,F1,F2,F3,F4,F5,F6,N,F7,F8,V,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,...,F40,F41,F42,F43,F44,F45,F46,F47,F48,F49,F50,F51,F52,F53,F54,F55,F56,F57,F58,F59,F60,F61,F62,F63,F64,F65,F66,F67,F68,F69,F70,F71,F72,F73,F74,F75,F76,F77,F78,F79
105,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,من به همراه همسرم از تاریخ 29/4 لغایت 2/5 در ه...,m,من به همراه همسرم از تاریخ 29 / 4 لغایت 2 / 5 ...,من به همراه همسرم از تاریخ / لغایت / در هت...,من به همراه همسرم از تاریخ لغایت در هتل ...,[من به همراه همسرم از تاریخ 29 / 4 لغایت 2 / 5...,"[من, به, همراه, همسرم, از, تاریخ, لغایت, در, ه...",488,0.713115,0.012295,0.233607,0.0,0.028689,90,90,4.011111,70,0.777778,0.4,0.488889,0.633333,0.1,74.074074,0.007491,0.128571,1052.284428,5.972354,8,20,16,10,14,8,7,5,2,0,0,0,...,4,1,3,1,1,8,1,2,6,0.0,0.008197,0.020492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,11,9.909091,0.0,488.0,0,0,0.0,0.088889,0.0,0.1,0,1,0,14,6,2,0,0,0,0
106,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,سلام من 3 شب با دوستام اونجا اقانت داشتم \nهتل...,m,سلام من 3 شب با دوستام اونجا اقانت داشتم \nهتل...,سلام من شب با دوستام اونجا اقانت داشتم \nهتل ...,سلام من شب با دوستام اونجا اقانت داشتم \nهتل ...,"[سلام من 3 شب با دوستام اونجا اقانت داشتم , ه...","[سلام, من, شب, با, دوستام, اونجا, اقانت, داشتم...",417,0.736211,0.007194,0.223022,0.0,0.007194,79,79,4.088608,65,0.822785,0.392405,0.43038,0.746835,0.037975,99.343054,0.010062,0.046154,2055.762682,5.824359,11,12,11,14,15,8,2,2,0,2,1,0,...,2,2,3,1,2,3,1,1,1,0.0,0.014388,0.004796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,7,12.0,0.166667,82.4,0,0,0.025316,0.113924,0.012658,0.025316,1,0,1,8,1,2,0,0,1,0
107,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,علی جان انشالله که خوش گذشته باشه.گفتی ناهارش ...,m,علی جان انشالله که خوش گذشته باشه . گفتی ناهار...,علی جان انشالله که خوش گذشته باشه . گفتی ناهار...,علی جان انشالله که خوش گذشته باشه گفتی ناهارش...,"[علی جان انشالله که خوش گذشته باشه ., گفتی ن...","[علی, جان, انشالله, که, خوش, گذشته, باشه, گفتی...",270,0.796296,0.007407,0.17037,0.0,0.014815,55,55,3.963636,46,0.836364,0.381818,0.436364,0.709091,0.090909,72.727273,0.007407,0.108696,1143.66691,5.426637,3,12,9,10,8,7,6,0,0,0,0,0,...,6,1,3,2,2,3,2,2,3,0.0,0.0,0.014815,0.0,0.0,0.011111,0.0,0.0,0.0,0.0,0,7,8.571429,0.0,270.0,0,0,0.090909,0.109091,0.036364,0.072727,2,0,0,4,5,1,0,0,0,0
108,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,با سلام \nمن به همراه خانواده و یک بچه 4 ماهه ...,m,با سلام \nمن به همراه خانواده و یک بچه 4 ماهه ...,با سلام \nمن به همراه خانواده و یک بچه ماهه ا...,با سلام \nمن به همراه خانواده و یک بچه ماهه ا...,"[با سلام , من به همراه خانواده و یک بچه 4 ماه...","[با, سلام, من, به, همراه, خانواده, و, یک, بچه,...",894,0.755034,0.011186,0.225951,0.0,0.004474,184,184,3.722826,123,0.668478,0.288043,0.538043,0.548913,0.070652,119.328922,0.011998,0.105691,1266.239056,6.512932,8,51,40,32,23,13,8,6,1,1,1,0,...,8,4,6,3,5,4,4,4,12,0.0,0.0,0.002237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,5,39.0,0.0,222.75,0,0,0.076087,0.130435,0.005435,0.157609,0,1,0,15,10,6,2,2,1,0
109,/content/drive/MyDrive/NLP-Spring 99-00/HW1/te...,با سلام\nمن در اغلب هتل های 5 ستاره کیش اقامت ...,m,با سلام\nمن در اغلب هتل‌های 5 ستاره کیش اقامت ...,با سلام\nمن در اغلب هتل‌های ستاره کیش اقامت د...,با سلام\nمن در اغلب هتل‌های ستاره کیش اقامت د...,"[با سلام , من در اغلب هتل‌های 5 ستاره کیش اقام...","[با, سلام, من, در, اغلب, هتل‌های, ستاره, کیش, ...",1777,0.737198,0.010129,0.221159,0.0,0.020822,344,344,3.875,193,0.561047,0.351744,0.456395,0.418605,0.069767,98.363981,0.009865,0.124352,999.093427,7.032848,15,76,66,66,61,31,21,6,2,0,0,0,...,16,4,21,3,3,19,7,5,26,0.0,0.0,0.007316,0.001688,0.0,0.0,0.0,0.0,0.0,0.000563,18,16,24.75,0.210526,116.266667,0,0,0.040698,0.101744,0.0,0.107558,2,0,2,25,18,6,0,1,1,0


**Classification**

In [282]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.preprocessing as preprocessing
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

Add word2vec features

In [283]:
# Load the word vectors stored in word2vec text format (binary=False) in blog.vec.
model = gensim.models.KeyedVectors.load_word2vec_format(config["current_working_dir"] + "blog.vec", binary=False)

In [284]:
# One averaged word vector per document, in row order.
lst = []
for index, row in data.iterrows():
    # `x in model` works on gensim 3.x and 4.x alike; the `model.vocab`
    # attribute used before was removed in gensim 4.
    lst_i = [model.get_vector(x) for x in row['tokenized_word'] if x in model]
    if lst_i:
        lst.append(np.mean(lst_i , axis=0).tolist())
    else:
        # No token is in the vocabulary: np.mean([]) would give a scalar nan
        # and break the fixed-width frame built from `lst`. Emit a NaN row of
        # the right width instead, to be treated as missing downstream.
        lst.append([np.nan] * model.vector_size)

In [285]:
# One row per document, one column W1..W300 per embedding dimension.
embedding_columns = ["W{}".format(dimension) for dimension in range(1, 301)]
df_w2v = pd.DataFrame(lst , columns=embedding_columns)

In [286]:
# Preview the averaged word-vector features.
df_w2v.head()

Unnamed: 0,W1,W2,W3,W4,W5,W6,W7,W8,W9,W10,W11,W12,W13,W14,W15,W16,W17,W18,W19,W20,W21,W22,W23,W24,W25,W26,W27,W28,W29,W30,W31,W32,W33,W34,W35,W36,W37,W38,W39,W40,...,W261,W262,W263,W264,W265,W266,W267,W268,W269,W270,W271,W272,W273,W274,W275,W276,W277,W278,W279,W280,W281,W282,W283,W284,W285,W286,W287,W288,W289,W290,W291,W292,W293,W294,W295,W296,W297,W298,W299,W300
0,-0.837028,0.429852,-0.397301,0.507135,-0.483689,-0.304654,0.100059,0.23939,1.467267,0.162821,0.037705,0.263094,-0.530917,-1.027841,-0.685985,-0.42626,0.44734,-0.604823,0.342403,-0.001714,-0.647165,-0.719822,-0.313351,0.08552,0.77587,-0.176961,-0.135389,0.378429,-0.881119,0.301059,2.015589,-1.0403,-0.06611,0.457236,0.262417,-1.419943,0.857954,0.033854,0.086218,-0.574271,...,0.52781,-0.363442,-0.153143,-0.300538,1.419595,0.422219,-1.221329,0.371151,-0.769374,0.820034,-0.747883,0.298938,0.252306,-0.456154,-0.059594,-0.574855,-0.296343,-0.061679,-0.64163,0.285858,0.725458,-0.600987,-0.4878,-0.632779,0.725262,0.142824,-1.046959,-1.599883,0.379227,-0.385168,1.097876,-0.278116,1.417867,-0.212165,1.03733,-0.035591,0.068145,-1.365651,0.069496,-0.204749
1,-1.126538,0.183382,0.632392,0.873847,-0.31794,0.33396,0.01748,-0.530098,0.019864,0.511871,0.246658,-0.35945,-0.502606,-0.340385,-1.369518,-0.44406,-0.338642,-0.952501,-0.106613,-0.188989,-0.591771,0.05988,-0.484875,0.092332,0.671824,-0.078877,-0.212067,0.009146,-1.048338,1.013049,1.178863,-0.90624,0.341824,0.377284,0.355188,-0.480851,-0.106981,-0.179548,0.398808,-0.424221,...,0.268524,-0.533557,0.014364,-0.071199,-0.070335,-0.312687,-0.125489,-0.417047,-0.592849,0.564484,-0.364487,-0.61621,-0.193117,0.012343,-0.358035,-0.168552,-0.342364,0.491329,-0.055903,-0.436484,0.383252,-0.188031,0.124307,-0.132027,0.214775,0.324322,-0.268805,-0.460694,-0.370299,-0.059947,-0.309613,-0.251924,0.547093,0.184294,-0.08744,-0.008302,-0.434463,0.147714,-1.091628,-0.863928
2,-1.056388,0.364631,0.407994,0.567005,0.16768,-0.344241,0.111831,-0.864737,1.024975,0.599807,0.229437,-0.195048,-0.566,-0.659714,-0.917438,-0.540133,0.102527,-1.306747,-0.323729,-0.506899,-0.557344,-0.135743,-0.0577,-0.39505,0.918544,-0.131061,-0.075422,-0.620925,-0.933741,0.734793,1.629758,-1.083341,0.031725,0.499293,-0.157283,-0.477497,0.320367,-0.327168,0.082711,-0.658986,...,-0.125494,-0.282114,0.243828,-0.882361,0.10072,-0.172393,-0.316154,-0.416657,0.032923,0.38035,-0.022929,-0.575458,-0.657933,-0.403179,-0.544222,-0.448686,0.015321,-0.362756,-0.524644,-0.495344,0.257003,-0.460331,-0.223231,-0.339392,0.310808,-0.001843,-0.376887,-0.576887,0.471527,-0.429756,0.020131,-0.416795,0.779041,0.089726,0.059349,-0.365418,0.00844,0.193296,-0.568538,-0.189066
3,-0.804049,0.159696,-0.139803,0.916475,-0.53116,-0.349368,-0.445502,-0.709264,0.281257,0.08912,0.215151,-0.365488,-0.13863,-0.352152,-0.689133,-0.557043,-0.247966,-0.630545,-0.005945,-0.406726,-0.128759,-0.508177,-0.175644,-0.254047,0.467707,-0.193242,-0.191371,0.106971,-0.70109,0.546148,0.901071,-1.059016,0.237848,0.506297,0.101406,-0.637087,0.081808,-0.280224,-0.09584,-0.277434,...,0.130427,-0.499367,0.269804,-0.069709,0.123557,0.044398,-0.233977,0.071176,-0.426246,0.169304,-0.133437,-0.43217,0.022572,0.426678,-0.145252,-0.403,-0.18208,0.177659,-0.524924,-0.537093,0.366031,-0.136849,0.00094,0.010952,0.500553,0.660102,-0.52147,-0.244194,-0.486109,-0.363752,-0.236404,-0.242692,0.662258,0.010752,0.21357,-0.347338,-0.187356,0.096706,-0.519366,-0.103051
4,-0.492923,0.459704,0.022931,0.244422,0.170331,-0.150311,-0.92345,-0.569713,0.514869,0.17059,-0.170285,-0.445251,-0.482783,-0.533469,-0.280435,-1.445678,0.824776,-0.440895,0.157003,-0.297945,-1.251672,-0.828825,-0.062458,0.188626,0.565109,-0.469665,0.604914,-0.42041,-1.381116,-0.018543,2.043709,-1.302462,-0.120759,1.380389,0.105632,-0.82928,0.600147,-0.338085,-0.034165,-0.343144,...,0.318931,-0.446132,-0.524367,-1.214723,0.320162,0.139994,-0.815456,0.162476,-0.372392,-0.209769,-0.570395,-0.403055,-0.010193,-0.075058,-0.85115,-0.437811,-0.402185,-0.335056,-0.404157,-0.605382,0.68424,-0.435751,0.037887,-0.277067,0.456185,1.172824,-0.060598,-0.594914,0.680765,0.614265,-0.15196,-1.031606,0.736004,-0.188195,0.685506,-0.010319,0.715592,0.04894,-0.956379,-0.010929


In [287]:
# Put the hand-crafted features and the word-vector features side by side
# (axis=1 joins columns; rows are matched on the index).
data_total = pd.concat([data, df_w2v], axis=1)

In [349]:
# Feature matrix: exactly the F<n> (hand-crafted) and W<n> (word-vector)
# columns, in their existing order. The previous pattern "[F+W].*" was an
# unanchored character class matching any column name that merely contains
# 'F', '+' or 'W'; it selected the right columns only because no other
# column name happens to contain those characters.
x = data_total.filter(axis=1 , regex=r"^[FW]\d+$")

# Encode the labels as integers: female -> 0, male -> 1.
y = data['gender'].map({'f': 0, 'm': 1}).astype('int')

# Seed for the shuffle below.
rs = 4

In [350]:
# Shuffle features and labels together; random_state=rs fixes the permutation.
x , y =  shuffle(x, y , random_state=rs)

In [351]:
# Inspect the last rows of the shuffled feature matrix.
x.tail()

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39,F40,...,W261,W262,W263,W264,W265,W266,W267,W268,W269,W270,W271,W272,W273,W274,W275,W276,W277,W278,W279,W280,W281,W282,W283,W284,W285,W286,W287,W288,W289,W290,W291,W292,W293,W294,W295,W296,W297,W298,W299,W300
104,807,0.760843,0.017348,0.206939,0.0,0.009913,155,4.019355,0.716129,0.36129,0.445161,0.574194,0.064516,67.429761,0.006787,0.09009,1105.121902,6.558328,7,34,28,30,29,7,13,3,2,0,1,0,1,0,0,0,0,0,0,0,15,4,...,-0.376958,0.090222,0.124852,-0.682958,0.007231,-0.244811,-0.256093,-0.012297,-0.272035,0.290415,-0.228289,-0.28499,-0.062925,-0.136974,-0.450313,-0.21373,-0.080371,-0.304961,-0.161594,-0.629414,0.351115,-0.093002,0.029581,-0.434856,0.349881,-0.103383,-0.332307,-0.274738,0.704022,-0.337876,0.212327,0.277471,0.294452,0.388835,0.226836,-0.351218,0.111924,-0.256351,-0.27832,0.08365
1,343,0.766764,0.017493,0.201166,0.0,0.011662,65,4.138462,0.830769,0.369231,0.461538,0.692308,0.107692,61.538462,0.00625,0.12963,1087.748014,5.660679,2,15,13,11,6,9,4,3,2,0,0,0,0,0,0,0,0,0,0,0,8,3,...,0.268524,-0.533557,0.014364,-0.071199,-0.070335,-0.312687,-0.125489,-0.417047,-0.592849,0.564484,-0.364487,-0.61621,-0.193117,0.012343,-0.358035,-0.168552,-0.342364,0.491329,-0.055903,-0.436484,0.383252,-0.188031,0.124307,-0.132027,0.214775,0.324322,-0.268805,-0.460694,-0.370299,-0.059947,-0.309613,-0.251924,0.547093,0.184294,-0.08744,-0.008302,-0.434463,0.147714,-1.091628,-0.863928
69,302,0.748344,0.0,0.211921,0.0,0.023179,63,3.634921,0.809524,0.206349,0.460317,0.634921,0.15873,65.507685,0.006656,0.196078,834.239709,5.584345,2,14,13,21,6,5,1,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,...,-0.314943,0.530383,0.069995,-0.241774,-0.139943,-0.470253,-0.53683,-0.534518,0.03758,0.683295,0.333539,-0.135495,-0.238788,0.01398,0.384504,-0.705882,0.111377,-0.26997,-0.407991,-0.221562,0.399054,-0.050086,0.374458,0.361372,0.911048,1.230061,-0.948099,-0.988818,-0.342013,0.220958,0.080736,-0.204889,0.9633,-0.470998,0.755604,-0.639073,-0.314996,-0.110358,-0.668109,-1.275531
55,575,0.78087,0.0,0.208696,0.0,0.010435,119,3.781513,0.731092,0.310924,0.462185,0.579832,0.10084,83.327449,0.008403,0.137931,1003.181031,6.220542,2,33,20,27,17,15,3,0,2,0,0,0,0,0,0,0,0,0,0,0,8,6,...,0.137254,0.041775,-0.18454,-0.67355,0.379798,-0.168352,-0.70102,-0.320208,-0.362762,0.604027,-0.704762,-0.163518,-0.226704,0.165105,-0.092945,-0.395871,-0.014579,-0.11494,-0.217931,-0.70948,-0.341857,-0.213233,0.590961,-0.102868,0.64525,0.703716,-1.358869,-0.791847,0.045537,-0.557507,-0.103739,0.393298,0.431076,-0.21521,0.758743,-0.592974,-0.107173,0.414839,-0.486004,-1.454245
46,492,0.739837,0.020325,0.207317,0.0,0.020325,96,3.885417,0.760417,0.3125,0.479167,0.604167,0.114583,78.125,0.007895,0.150685,964.705333,6.018665,6,22,18,20,14,6,4,3,1,1,0,1,0,0,0,0,0,0,0,0,11,0,...,-0.279456,0.010163,0.379146,-0.212864,0.255334,-0.49418,-0.65057,0.211663,-0.34026,0.616964,-0.037529,-0.096083,-0.094481,-0.367888,-0.174567,-0.081194,-0.279201,-0.03791,-0.228031,-0.210803,0.070161,-0.371992,-0.040287,-0.344021,0.712502,0.155793,-0.225278,-0.27751,0.501405,-0.059242,0.393803,-0.032716,0.737094,0.283668,0.32834,0.301342,-0.38292,-0.037621,-0.269879,-0.615881


In [352]:
# The model file is read as a sequence of pickled objects in the order
# scaler, feature selector, classifier.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# open model files you trust.
artifacts = []
with open(config['current_working_dir'] + config['model_name'], "rb") as f:
    while True:
        try:
            artifacts.append(pickle.load(f))
        except EOFError:
            break

# Reading in groups of three, as before, silently mixed objects from different
# dumps whenever the file did not hold a multiple of three objects. Fail
# loudly instead of predicting with an inconsistent pipeline.
if not artifacts or len(artifacts) % 3 != 0:
    raise ValueError(
        "expected a multiple of 3 pickled objects in the model file, found "
        + str(len(artifacts))
    )
# If the file holds several triples, the last one wins, as in the old loop.
min_max_scaler, variance_threshold, clf = artifacts[-3:]

In [353]:
#transform all feature's values to range [0 1]
x = pd.DataFrame(min_max_scaler.transform(x))

In [356]:
# Replace missing feature values with 0 after scaling.
x = x.fillna(0)

In [358]:
# Apply the fitted feature selector loaded from the model file (presumably a
# VarianceThreshold, going by its name — confirm).
# NOTE: x is rebound, so this cell must not be re-run on its own output.
x = pd.DataFrame(variance_threshold.transform(x))

In [359]:
# Replace any remaining missing values with 0 before prediction.
x = x.fillna(0)

In [360]:
# Predict the gender label (0 = female, 1 = male) of every test document.
y_pred = clf.predict(x)
# accuracy_score takes (y_true, y_pred); the arguments were swapped before.
# Accuracy is symmetric, so the value is unchanged, but the call now agrees
# with confusion_matrix(y, y_pred) used below.
acc = accuracy_score(y_true=y , y_pred=y_pred)

In [361]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [362]:
# Confusion matrix: rows are the true labels, columns the predicted labels
# (0 = female, 1 = male).
confusion_matrix(y, y_pred)

array([[40, 10],
       [ 6, 54]])

In [None]:
 # plot_confusion_matrix was removed in scikit-learn 1.2. ConfusionMatrixDisplay
 # is available in every release that shipped plot_confusion_matrix and in the
 # newer ones, and draws the same matrix from the classifier's predictions.
 from sklearn.metrics import ConfusionMatrixDisplay

 ConfusionMatrixDisplay(
     confusion_matrix=confusion_matrix(y, clf.predict(x)),
     display_labels=['female' , 'male'],
 ).plot(cmap=plt.cm.Blues)