# Data analysis

## Reading the dataframe

In [None]:
import pandas as pd

# Load the tweet dataset from a CSV file given by a relative path.
df = pd.read_csv("../main dataset/dataset.csv")
# Preview the first rows of the loaded frame.
df.head()

## Checking None values

In [None]:
# Count the missing entries in each column.
df.isna().sum()

## Preprocessing

### Tweet length analysis

In [None]:
import hazm

def _count_words(text):
    """Return the number of word tokens hazm produces for ``text``."""
    return len(hazm.word_tokenize(text))


def _count_sentences(text):
    """Return the number of sentences hazm produces for ``text``."""
    return len(hazm.sent_tokenize(text))


# Per-tweet length features: one column counted in words, one in sentences.
tweets = df['text']
df['tweet_len_by_words'] = tweets.apply(_count_words)
df['tweet_len_by_sents'] = tweets.apply(_count_sentences)
df.head()

In [None]:
def calFreq(samples):
    """Count how many times each distinct value occurs in ``samples``.

    Args:
        samples: Iterable of hashable values.

    Returns:
        A plain ``dict`` mapping each distinct value to its number of
        occurrences, with keys in first-seen order.
    """
    # Local import keeps this notebook cell self-contained.
    from collections import Counter

    # Counter replaces the hand-written tally loop; convert back to a plain
    # dict so callers keep receiving the same return type.
    return dict(Counter(samples))

# Tally how often each length occurs in the two length columns.
words_counts = calFreq(df["tweet_len_by_words"].to_list())
sents_counts = calFreq(df["tweet_len_by_sents"].to_list())

# Rebuild each table with its keys (the lengths) in ascending order.
tweet_len_by_words_freq = {length: words_counts[length] for length in sorted(words_counts)}
tweet_len_by_sents_freq = {length: sents_counts[length] for length in sorted(sents_counts)}

In [None]:
import matplotlib.pyplot as plt

# Line plot: for each tweet length in words (x), how many tweets have it (y).
plt.title("Tweet length by words")
plt.xlabel("Tweet length")
plt.ylabel("Tweet length frequency")
# Pass both axes as lists so x and y are handed to matplotlib the same way.
plt.plot(list(tweet_len_by_words_freq.keys()), list(tweet_len_by_words_freq.values()));

In [None]:
import matplotlib.pyplot as plt

# Line plot: for each tweet length in sentences (x), how many tweets have it (y).
plt.title("Tweet length by sentences")
plt.xlabel("Tweet length")
plt.ylabel("Tweet length frequency")
# Pass both axes as lists so x and y are handed to matplotlib the same way.
plt.plot(list(tweet_len_by_sents_freq.keys()), list(tweet_len_by_sents_freq.values()));

In [None]:
# Shortest and longest tweet, measured in words and in sentences.
word_lengths = df["tweet_len_by_words"]
sent_lengths = df["tweet_len_by_sents"]
min_max_len = [word_lengths.min(), word_lengths.max(), sent_lengths.min(), sent_lengths.max()]

min_words, max_words, min_sents, max_sents = min_max_len
print(f'Min length by word: {min_words} \tMax length by word: {max_words}')
print(f'Min length by sent: {min_sents} \tMax length by sent: {max_sents}')

In [None]:
def dataGreaterLessThan(data, less_than=100.0, greater_than=0.0, col='tweet_len_by_words'):
    """Print the share of rows whose ``col`` value lies in ``(greater_than, less_than]``.

    Args:
        data: DataFrame-like object; ``data[col].values`` must yield the lengths.
        less_than: Inclusive upper bound of the range.
        greater_than: Exclusive lower bound of the range.
        col: Name of the length column to inspect.

    Returns:
        None. The percentage is only printed.
    """
    lengths = data[col].values
    total = len(lengths)
    in_range = sum(1 for length in lengths if greater_than < length <= less_than)
    # An empty column has no rows in range; report 0% instead of dividing by zero.
    rate = (in_range / total) * 100 if total else 0.0
    # Name the inspected column and state the inclusive upper bound, so the
    # message is accurate for both the word and the sentence columns.
    print(f'Texts with {col} greater than {greater_than} and at most {less_than} make up {rate:.2f}% of the whole!')

In [None]:
# Share of tweets with at most 2, 3 and 5 sentences.
for max_sentences in (2, 3, 5):
    dataGreaterLessThan(df, max_sentences, 0, 'tweet_len_by_sents')

# Share of tweets with more than 3 and at most 93 words.
dataGreaterLessThan(df, 93, 3, 'tweet_len_by_words')

## Idea based on these length results
We know that text length is very important in `Natural Language Processing`, as models are highly dependent on the length of the sentences, especially `vectorizers` and `tokenizers`.

In the future, we can take a subset of the data from which very short and very long tweets have been removed.

## Checking target values

In [None]:
# Group the rows by their 'Sadness' value and sum the other columns within each group.
# NOTE(review): this aggregates every other column rather than counting rows per
# 'Sadness' value; if the distribution of the target is what is wanted,
# value_counts() may be the intended call — confirm.
df.groupby('Sadness').sum()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

targetCols = ['Sadness', 'Wonder', 'Hatred', 'Happiness', 'Fear', 'Anger']

# One horizontal bar chart per target column: how many tweets carry each value.
for targetCol in targetCols:
    # Use an explicit axes so the labels below are applied to this chart only.
    fig, ax = plt.subplots()
    df[targetCol].value_counts().sort_values().plot(kind='barh', ax=ax)
    # With horizontal bars the counts run along x and the column's values along y.
    ax.set_xlabel('Frequency')
    ax.set_ylabel(f'{targetCol} rate')
    ax.set_title(f'Distribution of {targetCol} rate within tweets')
    plt.show()