# Descriptive statistics

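The following code describes the dataset used in the sentiment analysis: for each year, and for all years combined, it reports the number of news articles, the mean word count, and the number of distinct companies.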

In [1]:
!pip install tabulate
!pip install pysentiment2

Collecting pysentiment2
  Downloading pysentiment2-0.1.1-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 12.3 MB/s 
Installing collected packages: pysentiment2
Successfully installed pysentiment2-0.1.1


In [2]:
import pandas as pd
import pysentiment2 as ps
from google.colab import drive 
from tabulate import tabulate
# Print wide dataframes on a single line instead of wrapping the columns.
pd.set_option("display.expand_frame_repr", False)

# Mount Google Drive (Colab) so the dataset can be read from it.
drive.mount("/content/gdrive")

# Read the processed news dataset; the path is relative to the working directory.
data = pd.read_csv(
    "gdrive/My Drive/Thesis/processed data/processdata_woSWandPS.csv"
)

# Prepare data for descriptive statistics:
# parse "Date" (YYYY-MM-DD strings) into datetimes so the .dt accessor is usable.
data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")


#Function to split initial dataframe into dataframes grouped by year
def split_years(dt):
    """Split a dataframe into one sub-frame per calendar year.

    Side effect: a "Year" column (derived from "Date") is added to ``dt``
    itself. This is kept on purpose, because a later cell filters the
    dataset through ``data.Year``.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain a datetime-typed "Date" column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct year, in order of first appearance in ``dt``.
        Rows whose "Date" is missing have no year and are left out, so an
        empty frame is never returned (callers read ``.iloc[0]`` of each frame).
    """
    dt["Year"] = dt["Date"].dt.year
    # A single groupby pass replaces one full boolean scan per year.
    # groupby drops NaN keys by default; sort=False keeps first-appearance order.
    return [year_frame for _, year_frame in dt.groupby("Year", sort=False)]

# One dataframe per calendar year found in the dataset.
data_splt_years = split_years(data)

# Summary rows of the form [year, #articles, mean word count, #distinct tickers].
data_fill = []
for df_year_splt in data_splt_years:
    # All rows of a yearly frame share one year, so the first row is enough.
    year = df_year_splt["Date"].iloc[0].year
    data_fill.append(
        [
            year,
            len(df_year_splt),
            round(df_year_splt["Word Count"].mean(), 0),
            df_year_splt["Ticker"].nunique(),
        ]
    )

# The same three metrics over the whole dataset, added as a closing row.
obs_count = len(data)
mean_word_count = round(data["Word Count"].mean(), 0)
company_count = data["Ticker"].nunique()
data_fill.append(["All years", obs_count, mean_word_count, company_count])

df_by_year = pd.DataFrame(
    data_fill,
    columns=["Year", "News Articles", "Mean Word Count", "Company Count"],
)

# Render the summary as a psql-style text table.
print(tabulate(df_by_year, headers="keys", tablefmt="psql"))

Mounted at /content/gdrive
+----+-----------+-----------------+-------------------+-----------------+
|    | Year      |   News Articles |   Mean Word Count |   Company Count |
|----+-----------+-----------------+-------------------+-----------------|
|  0 | 2009      |             111 |               783 |              70 |
|  1 | 2010      |            2791 |               404 |             865 |
|  2 | 2011      |            4277 |               426 |            1001 |
|  3 | 2012      |           10259 |               311 |            2279 |
|  4 | 2013      |           13143 |               257 |            3076 |
|  5 | 2014      |           26234 |               274 |            3997 |
|  6 | 2015      |           35457 |               319 |            4312 |
|  7 | 2016      |           36569 |               365 |            4261 |
|  8 | 2017      |           39629 |               477 |            4188 |
|  9 | 2018      |           77086 |               669 |            5195 

In [3]:
#Get top 120 companies with regards to news frequency (later only top 100, since some will be removed like SPX)

yearst = [2015, 2016, 2017, 2018, 2019]

# Keep only articles from the listed years. The year is read straight from
# "Date", so this cell does not rely on the "Year" column that split_years
# adds to `data` as a side effect.
data = data[data["Date"].dt.year.isin(yearst)]

# Number of distinct values per column for each ticker, ordered so that the
# tickers with the most distinct article texts come first.
unique_data_company = (
    data.groupby("Ticker")
    .nunique()
    .sort_values(by="Text", ascending=False)
)

# Show the 120 most frequently covered tickers.
print(list(unique_data_company.index[:120]))

['AAPL', 'AMZN', 'TSLA', 'FB', 'BA', 'NFLX', 'DIS', 'EFX', 'SPX', 'BAC', 'INTC', 'DAX', 'F', 'GLD', 'GE', 'GM', 'MSFT', 'SBUX', 'AIR', 'AAL', 'IBM', 'JPM', 'CMG', 'WFC', 'C', 'TWTR', 'WMT', 'MCD', 'AMD', 'NVDA', 'JNJ', 'GS', 'BABA', 'CAT', 'MU', 'CSCO', 'XOM', 'CVX', 'BP', 'GOOGL', 'USD', 'GPRO', 'COST', 'HD', 'QQQ', 'SQ', 'NKE', 'KO', 'AXP', 'TGT', 'ATVI', 'CMCSA', 'SNAP', 'DAL', 'LMT', 'T', 'ABBV', 'PFE', 'GILD', 'ADBE', 'CRM', 'VZ', 'AVGO', 'BX', 'LULU', 'BLK', 'UNH', 'FIT', 'KMI', 'BBY', 'PG', 'AGI', 'AA', 'AMAT', 'MRK', 'M', 'BIDU', 'QCOM', 'JCP', 'FDX', 'AMGN', 'BMY', 'ORCL', 'BHP', 'PYPL', 'MA', 'FRA', 'KR', 'SHOP', 'MO', 'GME', 'PM', 'VRX', 'CHK', 'ABX', 'MMM', 'BBBY', 'COP', 'UTX', 'IRBT', 'MS', 'SPY', 'FCX', 'HAL', 'AGN', 'HPQ', 'UAL', 'CELG', 'JWN', 'CVS', 'V', 'EA', 'STZ', 'GLW', 'ADP', 'APC', 'AZN', 'EBAY', 'ACN', 'PEP']
