# Descriptive statistics

The following code computes descriptive statistics for the dataset used in the sentiment analysis.

In [1]:
import pandas as pd
import pysentiment2 as ps
from google.colab import drive 
import numpy as np
pd.set_option('display.expand_frame_repr', False)

# Google Drive mount point inside the Colab runtime.
DRIVE_MOUNT_POINT = "/content/gdrive"
# Built from the mount point so the read does not depend on the kernel's
# current working directory.
DATA_PATH = DRIVE_MOUNT_POINT + "/My Drive/Thesis/processed data/data_processed.csv"

drive.mount(DRIVE_MOUNT_POINT)

data = pd.read_csv(DATA_PATH)

# Prepare data for descriptive statistics: parse the dates, order the rows
# chronologically and drop exact duplicate rows. Chained (no inplace=True) so
# the cell always rebuilds `data` from the freshly loaded frame.
data = (
    data
    .assign(Date=pd.to_datetime(data["Date"], format="%Y-%m-%d"))
    .sort_values(by="Date")
    .drop_duplicates()
)

def split_years(dt):
    """Split a dataframe into one dataframe per calendar year of ``Date``.

    A ``year`` column derived from ``Date`` is written onto ``dt`` itself
    (the caller's frame is modified), and every returned frame carries it.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with a datetime-typed ``Date`` column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct year, ordered by the first appearance of that
        year in ``dt``. Rows whose ``Date`` is missing belong to no year and
        are left out, so no returned frame is empty.
    """
    dt['year'] = dt['Date'].dt.year
    # One grouping pass instead of a full boolean scan per year. sort=False
    # keeps the order of first appearance; groupby skips missing keys, which
    # avoids the empty frame an equality test against NaN would produce.
    return [year_rows for _, year_rows in dt.groupby('year', sort=False)]

def _summarize_period(label, frame):
    """Build one row of the descriptive-statistics table.

    Parameters
    ----------
    label : int or str
        Value for the "Year" column of the row.
    frame : pandas.DataFrame
        Observations to summarise; reads the ``Date``, ``Word_count`` and
        ``Ticker`` columns.

    Returns
    -------
    list
        ``[label, observation count, earliest date, latest date,
        mean word count, number of distinct tickers]``.
    """
    dates = frame["Date"]
    return [
        label,
        len(frame),
        # min()/max() rather than first/last row: independent of row order.
        dates.min(),
        dates.max(),
        frame["Word_count"].mean(),
        frame["Ticker"].nunique(),
    ]


data_splt_years = split_years(data)

# One row per year, labelled with the calendar year of that frame's dates
data_fill = [
    _summarize_period(df_year_splt["Date"].min().year, df_year_splt)
    for df_year_splt in data_splt_years
]

# Same metrics for the whole dataset
data_fill.append(_summarize_period("All years", data))

df_by_year = pd.DataFrame(
    data_fill,
    columns=[
        "Year",
        "Observations",
        "Earliest Observation",
        "Latest Observation",
        "Mean Word Count",
        "Company Count",
    ],
)

print(df_by_year)


Mounted at /content/gdrive
         Year  Observations Earliest Observation Latest Observation  Mean Word Count  Company Count
0        2009           112           2009-06-26         2009-12-31       785.258929             71
1        2010          2836           2010-01-01         2010-12-31       409.405853            893
2        2011          4331           2011-01-01         2011-12-30       429.110136           1039
3        2012         10462           2012-01-03         2012-12-31       311.993118           2379
4        2013         13480           2013-01-01         2013-12-31       259.331009           3210
5        2014         26715           2014-01-01         2014-12-31       275.459330           4188
6        2015         36496           2015-01-01         2015-12-31       319.269454           4973
7        2016         37777           2016-01-01         2016-12-31       364.352490           5029
8        2017         40828           2017-01-01         2017-12-31      

In the next step, we compute the sentiment indices of the dataset using two dictionary methods: the Harvard IV-4 Psychosociological Dictionary and the Loughran and McDonald Dictionary.

In [None]:
hiv4 = ps.HIV4()
lm = ps.LM()


def _score_totals(texts, dictionary):
    """Sum a sentiment dictionary's scores over a collection of texts.

    Parameters
    ----------
    texts : iterable of str
        Documents to score.
    dictionary : object
        Sentiment dictionary exposing ``tokenize(text)`` and
        ``get_score(tokens)``; the score must have "Positive" and "Negative"
        entries.

    Returns
    -------
    tuple
        ``(positive total, negative total, tone total)``, where a document's
        tone is its positive score minus its negative score.
    """
    pos_total = 0
    neg_total = 0
    tone_total = 0
    for text in texts:
        score = dictionary.get_score(dictionary.tokenize(text))
        pos_total += score["Positive"]
        neg_total += score["Negative"]
        tone_total += score["Positive"] - score["Negative"]
    return pos_total, neg_total, tone_total


data_fill = []

for df_year_splt in data_splt_years:
    year = df_year_splt["Date"].iloc[0].year
    # Only the text column is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    texts = df_year_splt["Text"]

    data_fill.append([year, *_score_totals(texts, hiv4), *_score_totals(texts, lm)])

    # One progress line per year instead of one line per document.
    print(f"Scored {year}: {len(df_year_splt)} documents")

df_by_year_dm = pd.DataFrame(data_fill ,columns = ["Year", "HIV4 positive", "HIV4 negative", "HIV4 tone", "LM_pos", "LM_neg", "LM_tone"])

print(df_by_year_dm)
