In [None]:
import csv
import sys
from scipy.sparse import csc_matrix, vstack
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from google.colab import drive 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Mount Google Drive so the processed thesis data can be read and results written back.
drive.mount("/content/gdrive")

data = pd.read_csv("gdrive/My Drive/Thesis/processed data/data_whole_wS.csv", index_col = False)

print(data.head(20))

# Keep only complete observations. The explicit .copy() makes data_onlytext an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
data_onlytext = data.dropna().copy()
data_onlytext["Date"] = pd.to_datetime(data_onlytext["Date"])

# Vectorised date parts instead of per-row Python loops.
data_onlytext["Year"] = data_onlytext["Date"].dt.year
data_onlytext["Month"] = data_onlytext["Date"].dt.month

# Running month number: January 2015 -> 1, February 2015 -> 2, ..., January 2016 -> 13, ...
data_onlytext["ordered_month"] = (data_onlytext["Year"] - 2015) * 12 + data_onlytext["Month"]

# Create a sparse matrix from the nested n-gram count dictionary built in get_sparsematrix_and_car. Adapted from Frankel, Jennings and Lee (2021) (modified for own needs).

def sparse_mat(data):
    """Build a CSC sparse matrix from a nested dictionary of n-gram counts.

    Parameters
    ----------
    data : dict
        Maps an integer row index (one observation) to a dictionary of
        counts. The position of an entry inside the inner dictionary is used
        as its column index, so all inner dictionaries are expected to share
        the same key order.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix with ``max(row index) + 1`` rows and as many columns as the
        longest inner dictionary. Only non-zero counts are stored.

    Raises
    ------
    ValueError
        If ``data`` is empty, because no matrix shape can be derived.
    """
    if not data:
        raise ValueError("sparse_mat needs at least one observation")

    rows = []
    cols = []
    counts = []
    n_cols = 0

    for key, ngram_counts in data.items():
        n_cols = max(n_cols, len(ngram_counts))

        for colnum, count in enumerate(ngram_counts.values()):
            # Storing zeros explicitly would make the matrix dense in memory;
            # the explicit shape below keeps the dimensions correct instead.
            if count != 0:
                rows.append(key)
                cols.append(colnum)
                counts.append(count)

    shape = (max(data) + 1, n_cols)

    if not counts:
        # No n-gram occurred at all: return an all-zero matrix of the right shape.
        return csc_matrix(shape)

    # Sparse matrix of rows (observations) and columns (independent variables)
    return csc_matrix((counts, (rows, cols)), shape=shape)

# Function to collect the n-gram vocabulary of the whole training data set.
# It iterates over all text items in the training data set and returns all unique one- and two-grams found in the training data.

def get_ngrams(data):
    """Return the n-gram vocabulary (unique one- and two-grams) of ``data``.

    Each entry of the ``"Text"`` column is split into sentences on ``'.'``
    and each sentence into words on single spaces. Two-grams are formed from
    directly neighbouring words within a sentence only; a word that is empty
    or whitespace-only neither counts as a one-gram nor takes part in a
    two-gram.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"Text"`` column of strings.

    Returns
    -------
    list of str
        All unique one-grams in sorted order, followed by all unique
        two-grams in sorted order.
    """
    # Sets de-duplicate while scanning, so memory grows with the vocabulary
    # size rather than with the total number of tokens.
    onegrams = set()
    twograms = set()

    for text in data["Text"]:

        #### EXTRACT ALL ONE AND TWO WORD PHRASES #### Frankel, Jennings and Lee (2021)

        for sentence in text.split('.'):
            allwords = sentence.strip().split(' ')

            # Pair every word with its successor; the last word is paired with ''.
            for word0, word1 in zip(allwords, allwords[1:] + ['']):
                if not word0.strip():
                    continue

                onegrams.add(word0)

                if word1.strip():
                    twograms.add(word0 + ' ' + word1)

    return sorted(onegrams) + sorted(twograms)

# The function get_sparsematrix_and_car takes all one- and two-grams of the training data (from get_ngrams above)
# and iterates over each text item of the second data set (either training or test data) to count the occurrences of those n-grams.
# It returns a sparse matrix with one row per text item and one column per training n-gram, together with the list of CAR values.

def get_sparsematrix_and_car(data_train, data_test):
    """Count training-set n-grams in every text of ``data_test``.

    The vocabulary (columns) is taken from ``data_train`` via ``get_ngrams``;
    the rows are the observations of ``data_test`` in iteration order. Pass
    the same frame twice to obtain the training design matrix.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Frame with a ``"Text"`` column that defines the n-gram vocabulary.
    data_test : pandas.DataFrame
        Frame with ``"Text"`` and ``"CAR"`` columns whose texts are counted.

    Returns
    -------
    tuple
        ``(spar_mat, car)``: a sparse count matrix with one row per
        observation of ``data_test`` and one column per vocabulary n-gram,
        and the list of the corresponding ``"CAR"`` values.

    Raises
    ------
    ValueError
        If ``data_test`` contains no observations.
    """
    # Observations are converted to a sparse matrix in chunks of this size,
    # because holding the dense count dictionaries of all observations at
    # once overloads the RAM.
    chunk_size = 50

    wrd_list = tuple(sorted(get_ngrams(data_train)))

    # Dependent variable list (CAR)
    car = []

    # Finished sparse chunks, stacked once at the end
    chunks = []

    # Count dictionaries of the current chunk, keyed by row number within the chunk
    wrd_dictionary = {}

    for j, (_, row) in enumerate(data_test.iterrows()):

        car.append(row["CAR"])

        # Progress indicator
        print(j)

        # One counter per n-gram found in the training data set.
        # Frankel, Jennings and Lee (2021) (modified for own needs)
        counts = dict.fromkeys(wrd_list, 0)

        for sentence in row["Text"].split('.'):
            allwords = sentence.strip().split(' ')

            # Pair every word with its successor; the last word is paired with ''.
            for word0, word1 in zip(allwords, allwords[1:] + ['']):
                if not word0.strip():
                    continue

                # Only n-grams known from the training data are counted
                if word0 in counts:
                    counts[word0] += 1

                if word1.strip():
                    twogram = word0 + ' ' + word1
                    if twogram in counts:
                        counts[twogram] += 1

        wrd_dictionary[len(wrd_dictionary)] = counts

        if len(wrd_dictionary) == chunk_size:
            chunks.append(sparse_mat(wrd_dictionary))
            wrd_dictionary = {}

    # Remaining observations that did not fill a whole chunk
    if wrd_dictionary:
        chunks.append(sparse_mat(wrd_dictionary))

    if not chunks:
        raise ValueError("data_test contains no observations")

    spar_mat = vstack(chunks, format="csc")

    return spar_mat, car

def split_months(dt):
    """Split ``dt`` into one sub-frame per distinct ``"ordered_month"`` value.

    The sub-frames are returned as a list, ordered by the first appearance of
    each month value in ``dt``.
    """
    month_column = dt["ordered_month"]
    monthly_frames = []

    for month_id in month_column.unique():
        belongs_to_month = month_column == month_id
        monthly_frames.append(dt[belongs_to_month])

    return monthly_frames

data_splt_months = split_months(data_onlytext)

# Position (in data_splt_months) of the first training month of the first window.
# NOTE(review): hard-coded start, presumably to resume an interrupted run - confirm.
first_window = 46

# Columns written to the per-window result file, in this order
output_columns = ["Date", "Ticker", "Nasdaq", "Turnover", "Size", "BTM",
                  "pref_alpha", "CAR", "Text", "CAR_RF"]

np.random.seed(9000)

# Rolling window: train on months i, i+1, i+2 and predict month i+3.
# The range stops at the last window for which a test month still exists,
# so the loop ends cleanly instead of running past the end of the list.
for i in range(first_window, len(data_splt_months) - 3):

    data_train = pd.concat(data_splt_months[i:i + 3])
    # Work on a copy so that adding the prediction column does not write to a slice
    data_test = data_splt_months[i + 3].copy()

    print(len(data_train))
    print(len(data_test))

    #Sparse matrices as input for ML models
    X_train, y_train = get_sparsematrix_and_car(data_train, data_train)
    X_test, y_test = get_sparsematrix_and_car(data_train, data_test)

    #Random Forest
    rf = RandomForestRegressor(n_estimators=1000, max_features='sqrt')
    rf.fit(X_train, y_train)

    data_test["CAR_RF"] = rf.predict(X_test).tolist()

    # Keep the predictions available in the list of monthly frames
    data_splt_months[i + 3] = data_test

    # newline="" is what the csv module expects for files handed to csv.writer
    with open("gdrive/My Drive/Thesis/processed data/RF_sentiment/" + str(i+1) + ".csv", "w", newline="") as csv_file:

        writer = csv.writer(csv_file)
        writer.writerow(output_columns)
        for _, row in data_test.iterrows():
            writer.writerow([row[column] for column in output_columns])
            

        


Mounted at /content/gdrive
          Date Ticker  Nasdaq  Turnover          Size       BTM  pref_alpha  \
0   2015-01-01   AAPL       1  1.336802  6.370024e+08  0.187370    0.001312   
1   2015-01-02   AAPL       1  1.336802  6.370024e+08  0.187370    0.001312   
2   2015-01-05   AAPL       1  1.344416  6.190077e+08  0.192817    0.001142   
3   2015-01-06   AAPL       1  1.347419  6.190077e+08  0.192817    0.000912   
4   2015-01-07   AAPL       1  1.345351  6.190660e+08  0.192799    0.000963   
5   2015-01-08   AAPL       1  1.351682  6.277467e+08  0.190132    0.000958   
6   2015-01-09   AAPL       1  1.350462  6.518661e+08  0.183097    0.000957   
7   2015-01-12   AAPL       1  1.371621  6.363537e+08  0.187561    0.000885   
8   2015-01-13   AAPL       1  1.376746  6.363537e+08  0.187561    0.000836   
9   2015-01-14   AAPL       1  1.384635  6.420037e+08  0.185910    0.000921   
10  2015-01-15   AAPL       1  1.392119  6.395573e+08  0.186621    0.000991   
11  2015-01-16   AAPL    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

2575
630
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


KeyboardInterrupt: ignored