In [1]:
import pandas as pd
from collections import Counter

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [5]:
%pwd
%cd ./drive/MyDrive/files/

/content/drive/MyDrive/files


# Read file

In [6]:
df = pd.read_csv("data-train.csv")

# Check the format

In [7]:
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


# Traditional dict freq counter cell
- this cell is responsible for formatting and creating the traditional freq counter
- use this cell to edit the text before it goes into sorting

In [31]:
r"""
  Try the classic frequency counter approach with dicts.
  The time reduced drastically when adding the replace methods / creating consistent formatting
"""
frequency_dict = {}
for rownum, row in df.iterrows():
  e_id = row["essay_id"]
  frequency_dict[e_id] = {}
  # iterates over every word in each essay but lowercase to avoid counting There and there as two different words
  for word in row["full_text"].lower().replace('.','').replace(',','').replace("'", "").split():
    if word in frequency_dict[e_id]:
      frequency_dict[e_id][word] += 1
    else:
      frequency_dict[e_id][word] = 1

# Counter from collections experiment
- is Counter going to be faster? Surely is
- Previously just using dict method took ~5s + additional time sort
- Counter returns sorted freq already, reduces time to count freq, and has additional helper methods that might come in hand (most_common)

In [29]:
from collections import Counter
# Counter reduced time by another 2s and also reduces the need to run a sort
frequency_dict_counter_method = {}
for rownum, row in df.iterrows():
  e_id = row["essay_id"]
  frequency_dict_counter_method[e_id] = {}
  # iterates over every word in each essay but lowercase to avoid counting There and there as two different words
  frequency_dict_counter_method[e_id] = Counter(row["full_text"].lower().replace('.','').replace(',','').replace("'", "").split())


In [30]:
print(len(frequency_dict_counter_method))
print(frequency_dict_counter_method[df["essay_id"].iloc[2]])

17307
Counter({'the': 28, 'of': 18, 'to': 18, 'car': 13, 'not': 12, 'that': 11, 'a': 11, 'they': 10, 'new': 10, 'technology': 9, 'is': 9, 'cars': 9, 'this': 9, 'be': 9, 'in': 8, 'these': 8, 'and': 8, 'are': 8, 'have': 6, 'or': 6, 'many': 6, 'will': 6, 'just': 6, 'even': 6, 'may': 5, 'idea': 5, 'driving': 5, 'can': 5, 'if': 5, 'seems': 4, 'for': 4, 'get': 4, 'as': 4, 'it': 4, 'there': 4, 'people': 3, 'nobody': 3, 'manufacturers': 3, '"driverless"': 3, 'eventually': 3, 'driverless': 3, 'someone': 3, 'options': 3, '"fun"': 3, 'who': 3, 'into': 3, 'an': 3, 'accident': 3, 'since': 3, 'could': 3, 'however': 2, 'around': 2, 'now': 2, 'on': 2, 'up': 2, 'with': 2, 'has': 2, 'very': 2, 'way': 2, 'go': 2, 'completely': 2, 'need': 2, 'when': 2, 'any': 2, 'certain': 2, 'become': 2, 'like': 2, 'money': 2, 'want': 2, 'drive': 2, 'their': 2, 'so': 2, 'trying': 2, 'more': 2, 'meant': 2, 'also': 2, 'raises': 2, 'questions': 2, 'about': 2, 'one': 2, 'states': 2, 'do': 2, 'laws': 2, 'system': 2, 'them': 2

# Sort Cell
- this cell is used to sort the frequency results
- run this cell after making adjustments to formatting in the cell above

In [25]:
r"""
Sort Cell
"""
sorted_by_frequency_dict = {}
# Sort in reverse so highest freq words appear first
# expect these to be most common words like "the", "of", "a", etc...
for k,v in frequency_dict.items():
  sorted_by_frequency_dict[k] = {l:m for l,m in sorted(v.items(), key=lambda item: item[1], reverse=True)}


In [15]:
print(len(frequency_dict))
print(len(df["essay_id"]))

17307
17307


In [17]:
df.head()

{'000d118': {'Many': 1,
  'live.': 1,
  'know': 1,
  'when': 1,
  'happen\xa0like': 1,
  'get': 1,
  'accidet': 1,
  'or\xa0the': 1,
  'smoke': 1,
  'bad': 1,
  'breath\xa0on': 1,
  'someone': 1,
  'VAUBAN,Germany': 1,
  'dont': 1,
  'proble': 1,
  '70': 1,
  "vauban's": 1,
  'families': 1,
  'own': 1,
  '57': 1,
  'sold': 1,
  'move': 1,
  'there.': 1,
  'Street': 1,
  'parkig': 1,
  ',driveways': 1,
  'home': 1,
  'forbidden\xa0on': 1,
  'outskirts': 1,
  'freiburd': 1,
  'near': 1,
  'French': 1,
  'Swiss': 1,
  'borders.': 1,
  'You': 1,
  'probaly': 1,
  "won't": 1,
  "Vauban's": 1,
  'streets': 1,
  'completely': 1,
  'free"': 1,
  'but\xa0If': 1,
  'lives': 1,
  'owns': 1,
  'ownership': 1,
  'allowed,but': 1,
  'two': 1,
  'places': 1,
  'park': 1,
  'large': 1,
  'at': 1,
  'edge': 1,
  'development,where': 1,
  'owner': 1,
  'buys': 1,
  'cheap': 1,
  'buy': 1,
  'one': 1,
  'sell': 1,
  '$40,000': 1,
  'along': 1,
  'home.': 1,
  'vauban': 1,
  'completed': 1,
  '2006': 1,
 

In [18]:
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [23]:
print(len(sorted_by_frequency_dict))
print(sorted_by_frequency_dict[df["essay_id"].iloc[2]])

17307
{'the': 26, 'of': 18, 'to': 18, 'not': 12, 'that': 11, 'a': 11, 'they': 10, 'car': 10, 'technology': 9, 'is': 9, 'be': 9, '': 9, 'in': 8, 'new': 8, 'and': 8, 'are': 8, 'this': 7, 'or': 6, 'will': 6, 'just': 6, 'even': 6, 'have': 5, 'may': 5, 'cars': 5, 'idea': 5, 'can': 5, 'seems': 4, 'for': 4, 'get': 4, 'as': 4, 'it': 4, 'driving': 4, 'people': 3, 'nobody': 3, 'many': 3, 'manufacturers': 3, 'eventually': 3, 'driverless': 3, 'someone': 3, 'options': 3, 'who': 3, 'into': 3, 'an': 3, 'since': 3, 'could': 3, 'these': 2, 'around': 2, 'on': 2, 'up': 2, 'with': 2, 'has': 2, 'very': 2, 'way': 2, 'completely': 2, '"driverless".': 2, 'need': 2, 'when': 2, 'any': 2, 'car.': 2, 'certain': 2, 'become': 2, 'like': 2, 'money': 2, 'if': 2, 'want': 2, 'drive': 2, 'their': 2, 'so': 2, 'trying': 2, 'more': 2, 'meant': 2, '"fun"': 2, 'also': 2, 'raises': 2, 'questions': 2, 'about': 2, 'accident': 2, 'one': 2, 'cars.': 2, 'states': 2, 'do': 2, 'new,': 2, "car's": 2, 'always': 1, 'wish': 1, 'had': 1,