# 5. Feature Extraction 3 - bi-grams

#### This script extracts the word bi-grams from the POS and bleached CSV files and also calculates their TF-IDF, so that the top few can be extracted and used as features. This is done only for the training set, so that the resulting 'vocabulary' can then be applied to the testing set as well.

### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict
import math
from IPython.display import clear_output

### Definitions

In [2]:
# Input CSV files that the bi-gram extraction cells read.
pos_filename, bleached_filename = 'data/POS.csv', 'data/Bleached.csv'

# Output CSV files that receive the per-bigram frequency tables.
pos_ngram_filename, bleached_ngram_filename = (
    'data/POS-ngram-all.csv',
    'data/Bleached-ngram-all.csv',
)

### Go through POS file, get bi-grams and add to the dictionary

In [3]:
# Number of rows per chunk used by the processing loops; it also defines the
# denominator of the progress indicator computed below.
chunksize = 10

# Largest PostID found in the POS file.  A running maximum is kept so the
# result does not depend on the rows being sorted by PostID.  Only the PostID
# column is loaded, and the scan uses large chunks because it does no per-row
# work.
number_of_posts = 0
for chunk in pd.read_csv(pos_filename, usecols=['PostID'], chunksize=100000):
    post_ids = chunk['PostID'].dropna()
    if not post_ids.empty:
        number_of_posts = max(number_of_posts, post_ids.max())

# Expected number of `chunksize`-row chunks, used only for progress display.
number_of_chunks = math.ceil(number_of_posts / chunksize)

In [4]:
# Accumulator keyed by bigram (w1, w2).  Each value is a 2-tuple:
#   [0] score accumulated from posts labelled 'male'
#   [1] score accumulated from posts with any other label
# Every training post contributes a total weight of 1 to its own slot, split
# evenly over the bigram occurrences of that post (1 / number of bigrams each).
ngrams_dict = defaultdict(lambda: (0, 0))

for i, chunk in enumerate(pd.read_csv(pos_filename, chunksize=chunksize)):
    # Skip rows that have a missing value in any column.
    chunk = chunk.dropna()

    # Refresh the progress indicator every 30 chunks.
    if i % 30 == 0:
        clear_output()
        print('{0}/{1}'.format(i+1, number_of_chunks), end = '', flush=True)

    for _, row in chunk.iterrows():
        # Columns are addressed by position: 1 is the text that gets
        # tokenized, 2 is compared against 'male', 3 against 'train'.
        # NOTE(review): column names are not visible here -- confirm the layout.
        if row.iloc[3] != 'train':
            continue

        # Tokenize once and reuse the bigram list for both the count and the loop.
        post_bigrams = list(nltk.bigrams(nltk.word_tokenize(row.iloc[1])))
        if not post_bigrams:
            continue

        inverse_number_of_bigrams = 1 / len(post_bigrams)
        is_male = row.iloc[2] == 'male'

        for bigram in post_bigrams:
            male_score, female_score = ngrams_dict[bigram]
            if is_male:
                ngrams_dict[bigram] = (male_score + inverse_number_of_bigrams, female_score)
            else:
                ngrams_dict[bigram] = (male_score, female_score + inverse_number_of_bigrams)

clear_output()
print('DONE - {0} files'.format(number_of_posts))

DONE - 679782 files


### Save dictionary as new csv

In [5]:
# Flatten the accumulator into one record per bigram.  The frame is built in a
# single call: growing a DataFrame row by row copies it on every step, and
# DataFrame.append is no longer available in pandas 2.x.
records = [
    {
        'Word1': word1,
        'Word2': word2,
        'FrequencyMale': frequency_male,
        'FrequencyFemale': frequency_female,
    }
    for (word1, word2), (frequency_male, frequency_female) in ngrams_dict.items()
]

# Passing `columns` keeps the header even when the dictionary is empty.
df = pd.DataFrame(records, columns=['Word1', 'Word2', 'FrequencyMale', 'FrequencyFemale'])

df.to_csv(pos_ngram_filename, index=False)

### Go through Bleached file, get bi-grams and add to the dictionary

In [6]:
# Fresh accumulator for the bleached file, keyed by bigram (w1, w2).  Each
# value is a 2-tuple:
#   [0] score accumulated from posts labelled 'male'
#   [1] score accumulated from posts with any other label
# Every training post contributes a total weight of 1 to its own slot, split
# evenly over the bigram occurrences of that post (1 / number of bigrams each).
ngrams_dict = defaultdict(lambda: (0, 0))

for i, chunk in enumerate(pd.read_csv(bleached_filename, chunksize=chunksize)):
    # Skip rows that have a missing value in any column.
    chunk = chunk.dropna()

    # Refresh the progress indicator every 30 chunks.
    if i % 30 == 0:
        clear_output()
        print('{0}/{1}'.format(i+1, number_of_chunks), end = '', flush=True)

    for _, row in chunk.iterrows():
        # Columns are addressed by position: 1 is the text that gets
        # tokenized, 2 is compared against 'male', 3 against 'train'.
        # NOTE(review): column names are not visible here -- confirm the layout.
        if row.iloc[3] != 'train':
            continue

        # Tokenize once and reuse the bigram list for both the count and the loop.
        post_bigrams = list(nltk.bigrams(nltk.word_tokenize(row.iloc[1])))
        if not post_bigrams:
            continue

        inverse_number_of_bigrams = 1 / len(post_bigrams)
        is_male = row.iloc[2] == 'male'

        for bigram in post_bigrams:
            male_score, female_score = ngrams_dict[bigram]
            if is_male:
                ngrams_dict[bigram] = (male_score + inverse_number_of_bigrams, female_score)
            else:
                ngrams_dict[bigram] = (male_score, female_score + inverse_number_of_bigrams)

clear_output()
print('DONE - {0} files'.format(number_of_posts))

DONE - 679782 files


### Save dict as new csv

In [9]:
number_of_entries = len(ngrams_dict)

columns = ['Word1', 'Word2', 'FrequencyMale', 'FrequencyFemale']

# Create (or overwrite) the output file with just the header row.
df = pd.DataFrame(columns=columns)
df.to_csv(bleached_ngram_filename, index=False)

# Append the data rows in batches.  The handle is opened with newline='' as
# pandas requires for file objects passed to to_csv, and with an explicit
# UTF-8 encoding so the rows match the header written above.
with open(bleached_ngram_filename, 'a', encoding='utf-8', newline='') as f:
    # Rows waiting to be written; a plain list avoids copying a DataFrame on
    # every insertion.
    pending_rows = []

    for i, ((word1, word2), (frequency_male, frequency_female)) in enumerate(ngrams_dict.items()):

        # Refresh the progress indicator every 30 entries.
        if i % 30 == 0:
            clear_output()
            print('{0}/{1}'.format((i+1), number_of_entries), end = '', flush=True)

        pending_rows.append((word1, word2, frequency_male, frequency_female))

        # Flush once the buffer holds more than 1000 rows.
        if len(pending_rows) > 1000:
            df = pd.DataFrame(pending_rows, columns=columns)
            df.to_csv(f, header=False, index=False)
            pending_rows = []

    # Write whatever is left after the last full batch.
    if pending_rows:
        df = pd.DataFrame(pending_rows, columns=columns)
        df.to_csv(f, header=False, index=False)

clear_output()
print('DONE - {0} files'.format(number_of_entries))

DONE - 2010166 files
