### sim_py_textJEWK.ipynb

### Text mining: compare two books

# Packages

### Install packages

"codecs" is for reading the text files, 
"re" (regular expretions) and "collections" for working with tokens,
"nltk" (natural language toolkit)

In [2]:
!pip install pandas
!pip install numpy
!pip install scipy
!pip install scikit-learn
!pip install nltk
!pip install matplotlib



### Import packages

In [3]:
import codecs
import re
import copy
import collections

In [4]:
import numpy as np
import pandas as pd

In [5]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer

In [6]:
from __future__ import division


In [7]:
import matplotlib
%matplotlib inline

UsageError: Line magic function `%` not found.


# Download stopwords

### Some specialized functions from NLTK
You can also download everything in NLTK with nltk.download(), but it will take time!

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SvetlanaMeissner\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

Import the stopwords package from NLTK

In [9]:
from nltk.corpus import stopwords

# Data

### Read data for Windows

In [10]:
with codecs.open('C:/Users\SvetlanaMeissner/Documents/ddoc/07_Enterprice/DLEARN/Simulation-KL/PSs/PS7/fordata/JaneEyre.txt', "r", encoding="utf-8") as f:
    text_JE = f.read()
with codecs.open('C:/Users\SvetlanaMeissner/Documents/ddoc/07_Enterprice/DLEARN/Simulation-KL/PSs/PS7/fordata//WutheringHeights.txt', "r", encoding="utf-8") as f:
    text_WH = f.read()


# Process data
Check for stopwords

In [11]:
esw = stopwords.words('english')
esw.append("would")

Filter tokens (using regular expressions)

In [12]:
word_pattern = re.compile("^\w+$")

Create a token counter function

In [13]:
def get_text_counter(text):
    tokens = WordPunctTokenizer().tokenize(PorterStemmer().stem(text))
    tokens = list(map(lambda x: x.lower(), tokens))
    tokens = [token for token in tokens if re.match(word_pattern, token) and token not in esw]
    return collections.Counter(tokens), len(tokens)

Create a function to calculate the absolute frequency of the most commen words.

In [14]:
def make_df(counter, size):
    abs_freq = np.array([el[1] for el in counter])
    rel_freq = abs_freq / size
    index = [el[0] for el in counter]
    df = pd.DataFrame(data=np.array([abs_freq, rel_freq]).T, index=index, columns=["Absolute frequency", "Relative frequency"])
    df.index.name = "Most common words"
    return df

# Analysis

## Analyze individual texts

Calculate the most common words of Jane Eyre and display the 15 most common.

In [15]:
je_counter, je_size = get_text_counter(text_JE)


In [16]:
make_df(je_counter.most_common(15), je_size)

Unnamed: 0_level_0,Absolute frequency,Relative frequency
Most common words,Unnamed: 1_level_1,Unnamed: 2_level_1
one,593.0,0.00679
said,584.0,0.006687
mr,543.0,0.006218
could,504.0,0.005771
like,397.0,0.004546
rochester,366.0,0.004191
well,348.0,0.003985
little,341.0,0.003905
jane,341.0,0.003905
sir,315.0,0.003607


Save the 1000 most common words of Jane Eyre to .csv

In [17]:
je_df = make_df(je_counter.most_common(1000), je_size)
je_df.to_csv("JE2_1000.csv")

Calculate the most common words of Withering Hights and display the 15 most common.

In [18]:
wh_counter, wh_size = get_text_counter(text_WH)

In [19]:
make_df(wh_counter.most_common(15), wh_size)

Unnamed: 0_level_0,Absolute frequency,Relative frequency
Most common words,Unnamed: 1_level_1,Unnamed: 2_level_1
heathcliff,475.0,0.008735
linton,404.0,0.007429
catherine,379.0,0.00697
said,375.0,0.006896
mr,312.0,0.005738
one,290.0,0.005333
could,279.0,0.005131
master,205.0,0.00377
shall,191.0,0.003512
come,190.0,0.003494


Save the 1000 most common words of Withering Hights to .csv

In [20]:
wh_df = make_df(wh_counter.most_common(1000), wh_size)
wh_df.to_csv("WH2_1000.csv")

# Compare texts

Find the most common words across the two documents.

In [21]:
all_counter = wh_counter + je_counter

In [22]:
all_df = make_df(wh_counter.most_common(1000), 1)
most_common_words = all_df.index.values

Create a data frame with the differences in word frequency

In [23]:
df_data = []
for word in most_common_words:
    je_c = je_counter.get(word, 0) / je_size
    wh_c = wh_counter.get(word, 0) / wh_size
    d = abs(je_c - wh_c)
    df_data.append([je_c, wh_c, d])
    
    

In [24]:
diff_df = pd.DataFrame(data=df_data, index=most_common_words,
                          columns=["JE relative frequency", "WH relative frequency", "Differences in relative frequency"])
diff_df.index.name = "Most common words"
diff_df.sort_values("Differences in relative frequency", ascending=False, inplace=True)
    

Display the most 20 distinctive words.

In [25]:
diff_df.head(20)

Unnamed: 0_level_0,JE relative frequency,WH relative frequency,Differences in relative frequency
Most common words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
heathcliff,0.0,0.008735,0.008735
linton,0.0,0.007429,0.007429
catherine,1.1e-05,0.00697,0.006958
hareton,0.0,0.003292,0.003292
sir,0.003607,0.000791,0.002816
master,0.001134,0.00377,0.002636
joseph,0.0,0.002575,0.002575
earnshaw,0.0,0.002372,0.002372
cathy,0.0,0.00228,0.00228
edgar,0.0,0.002133,0.002133


Save the full list of distinctive words to a dist_JEWH.csv

In [26]:
diff_df.to_csv("dist_JEWH.csv")