# Import Python Packages

In [79]:
# Standard library
from collections import Counter
from pprint import pprint

# Third-party
import daft
import matplotlib.pyplot as plt
import numpy as np
from daft import col
from tqdm import tqdm

# Notebook-wide plotting style
plt.style.use('ggplot')

print('All packages imported successfully!')

All packages imported successfully!


# Explore the Dataset

In [None]:
# Load the training dataset: a semicolon-delimited CSV whose column types are inferred
train_df = daft.read_csv(
    "./data/train.csv",
    infer_schema=True,
    delimiter=";",
)

In [None]:
# Number of rows in the training set (count_rows() runs the query and returns the total)
print(f"ROWS: {train_df.count_rows()}")

ROWS: 21000


In [None]:
# Number of values in each column (a per-column count, NOT the number of columns).
# NOTE(review): Daft's count presumably skips nulls, so a column with missing
# values would show a smaller number than the row count — confirm against the docs.

train_df.count(col("*")).show()

Score UInt64,Summary UInt64,Text UInt64
21000,21000,21000


In [None]:
# Display the schema: one row per column, giving its name and inferred type
train_df.describe().show()

column_name Utf8,type Utf8
Score,Int64
Summary,Utf8
Text,Utf8


In [None]:
# Display the first 5 rows.
# show(5) only executes enough of the query to produce the preview, so the
# whole DataFrame does not need to be materialized with collect() first.

train_df.show(5)

Score Int64,Summary Utf8,Text Utf8
1,Why the white tea blend?,I really cannot emphasize how disappointed I am with Celestial Seasoning's having ruined its perfectly good green tea. The company has apparently totally eliminated production of plain green tea in favor of blends with white tea.
1,Horrible quality,The shipment I received had soooooo much powder it made the tea undrikable. Looks like all the great reviews are older.
4,The Jalapeno Story,I love the taste of jalapenos but not if they're too hot. My grandfather used to eat jalapeno sandwiches with white bread.
5,Addictive!,These sunflower seeds are incredible. They are not as hot as one would assume.
5,Brings back memories of my Mother's lemon pies,My sister and I were discussing how delicious the pies (especially lemon) our Mother baked when we were young. We remembered that she always used My-T-Fine fillings.


In [None]:
# Display the column statistics
# (type, min, max, count, null count and approximate distinct count per column)

train_df.summarize().show()

column Utf8,type Utf8,min Utf8,max Utf8,count UInt64,count_nulls UInt64,approx_count_distinct UInt64
Score,Int64,1.0,5,21000,0,5
Summary,Utf8,,"~sugar free yes, low carb no~",21000,0,17714
Text,Utf8,,"yummy, but not really much cheaper then walmart. my 7 month old LOVES them and will eat them all day long, perfect because they disolve so quickly he won't choke.",21000,0,20276


In [42]:
# Class balance: number of reviews for each Score, highest rating first
(
    train_df
    .groupby("Score")
    .count()
    .sort("Score", desc=True)
    .show()
)

Score Int64,Summary UInt64,Text UInt64
5,13126,13126
4,3143,3143
3,1593,1593
2,1177,1177
1,1961,1961


In [75]:
# Add a per-review token count for Text (tokens = pieces obtained by splitting
# on a single space) and materialize the result so later cells can reuse it.
# NOTE(review): splitting on " " presumably yields empty tokens for runs of
# consecutive spaces and a count of 1 for empty text — confirm this is intended.
dataset = train_df.with_column(
    "num_tokens_text",
    train_df["Text"].str.split(" ").list.length(),
).collect()

In [76]:
# Preview the dataset, now including the num_tokens_text column
dataset.show()

Score Int64,Summary Utf8,Text Utf8,num_tokens_text UInt64
4,"Smell bad, but dogs don't mind",This is a great product. My dogs love chewing them and they last for a long time.,18
5,needed more,I had bought some of this locally and liked it so much that i had to lookk for a better price. I have found the better price here on amazon.,31
1,a distant second,"this is nowhere near the quality and taste of Splenda's french vanilla ""flavors for coffee"". <a href=""http://www.",17
1,Bait & Switch,**Vine Voices will receive a total of 3 Lickety Stiks in separate flavors,13
5,Edible Silver Leaf for Cakes/Candies,"Great product at a good price. It used to be possible to buy edible silver and gold leaf (vark) at Indian groceries, but there seem to be problems nowadays importing it.",32
5,Yummy!,I had to make cornbread for my son's cubscouts banquet and we are now gluten free. I am not fond of cornbread usually but my son LOVES it.,28
1,Always rated at the very bottom of unbiased reviews.,Please do some research yourself to find out that they are ALWAYS rated the worst food for your pet in unbiased reviews.<br /><br />The top ingredients for this product are crude meal and loads of grains.,36
5,My go-to coffee subsitute,"Dandelion root is reported to aid in reducing high blood pressure, which is why I drink it. I've pretty much given up coffee drinking and this is a tasty substitute, though one must acquire a taste for it.",39


In [77]:
# Add a token count for Summary (split on single spaces, same approach as for
# Text) and materialize the result.
dataset = dataset.with_column(
    "num_tokens_Summary", dataset["Summary"].str.split(" ").list.length()
).collect()


In [78]:
# Preview the dataset, now including the num_tokens_Summary column
dataset.show()

Score Int64,Summary Utf8,Text Utf8,num_tokens_text UInt64,num_tokens_Summary UInt64
4,"Smell bad, but dogs don't mind",This is a great product. My dogs love chewing them and they last for a long time.,18,6
5,needed more,I had bought some of this locally and liked it so much that i had to lookk for a better price. I have found the better price here on amazon.,31,2
1,a distant second,"this is nowhere near the quality and taste of Splenda's french vanilla ""flavors for coffee"". <a href=""http://www.",17,3
1,Bait & Switch,**Vine Voices will receive a total of 3 Lickety Stiks in separate flavors,13,3
5,Edible Silver Leaf for Cakes/Candies,"Great product at a good price. It used to be possible to buy edible silver and gold leaf (vark) at Indian groceries, but there seem to be problems nowadays importing it.",32,5
5,Yummy!,I had to make cornbread for my son's cubscouts banquet and we are now gluten free. I am not fond of cornbread usually but my son LOVES it.,28,1
1,Always rated at the very bottom of unbiased reviews.,Please do some research yourself to find out that they are ALWAYS rated the worst food for your pet in unbiased reviews.<br /><br />The top ingredients for this product are crude meal and loads of grains.,36,9
5,My go-to coffee subsitute,"Dandelion root is reported to aid in reducing high blood pressure, which is why I drink it. I've pretty much given up coffee drinking and this is a tasty substitute, though one must acquire a taste for it.",39,4


In [None]:
# Frequency table: how many reviews have each Text token count.
# Counter comes from the notebook's import cell, so it is not re-imported here.

# Pull only the needed column into pandas; converting the whole DataFrame would
# also copy the large Summary/Text string columns for no benefit.
num_tokens_list = dataset.select("num_tokens_text").to_pandas()["num_tokens_text"]

# Count occurrences of each token count, ordered by token count (ascending)
token_count_frequency = dict(sorted(Counter(num_tokens_list).items()))

# Convert back to a Daft DataFrame
text_frequency = daft.from_pydict({
    "num_tokens_text": list(token_count_frequency.keys()),
    "count": list(token_count_frequency.values()),
})


In [83]:
# Show up to 28 rows of the token-count frequency table
text_frequency.show(28)

num_tokens_text Int64,count Int64
1,131
2,101
3,42
4,77
5,96
...,...
24,742
25,694
26,696
27,689
