In [5]:
import senticnet
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from os import listdir
from os.path import isfile, join

In [7]:
def extract_sentic_concepts_and_scores_csv(field, ticker):
    source_dir = '/home/andrea/Desktop/NLFF/intrinioDatasetUpdated/preprocessing/preprocessed/'
    dest_dir = 'conceptsDataset/title/'+str(ticker)+'.csv'
    tickFiles = [f for f in listdir(source_dir) if isfile(join(source_dir, f))]
    source_df = pd.read_csv(source_dir+ticker+'.csv')
    count =1
    while str(ticker)+str(count)+'.csv' in tickFiles:
        print(ticker+str(count))
        newsTemp = pd.read_csv(source_dir + ticker +str(count)+'.csv')
        source_df = pd.concat([source_df, newsTemp])
        count+=1
    source_df = source_df.rename(index=str, columns={"PUBLICATION_DATE": "DATE"})
    source_df.drop_duplicates(subset=['DATE'], inplace=True)
    source_df = source_df.sort_values(by=['DATE'])
    source_df = source_df.reset_index(drop=True)
    
    source_df['concepts'] = ''
    source_df['polarity'] = ''
    source_df['attention'] = ''
    source_df['pleasantness'] = ''
    source_df['aptitude'] = ''
    source_df['sensitivity'] = ''

    sn = senticnet.Senticnet()
    stemmer = PorterStemmer()

    counter = 0
    null = 0
    stemmed_null = 0
    for index, row in source_df.iterrows():
        concepts = []
        concept_scores = {'polarity' : 0, 'attention': 0, 'pleasantness': 0, 'aptitude': 0, 'sensitivity': 0}
        stemmed_concepts = []
        stemmed_concept_scores = {'polarity': 0, 'attention': 0, 'pleasantness': 0, 'aptitude': 0, 'sensitivity': 0}

        if isinstance(row[field], float):
            content = ''
            stemmed_title = ''
        else:
            content = '_' + row[field].replace(' ', '_').lower() + '_'
            stemmed_title = '_' + '_'.join([stemmer.stem(t) for t in row[field].lower().split()]) + '_'
        # print(stemmed_title)
        for concept_key in sn.data.keys():
            if '_' + concept_key +'_' in content:
                concepts.append(concept_key)
                concept_data = sn.concept(concept_key)
                concept_scores['polarity'] += concept_data['polarity']
                concept_scores['attention'] += concept_data['sentics']['attention']
                concept_scores['pleasantness'] += concept_data['sentics']['pleasantness']
                concept_scores['aptitude'] += concept_data['sentics']['aptitude']
                concept_scores['sensitivity'] += concept_data['sentics']['sensitivity']
            if '_' + concept_key +'_' in stemmed_title:
                stemmed_concepts.append(concept_key)
                concept_data = sn.concept(concept_key)
                stemmed_concept_scores['polarity'] += concept_data['polarity']
                stemmed_concept_scores['attention'] += concept_data['sentics']['attention']
                stemmed_concept_scores['pleasantness'] += concept_data['sentics']['pleasantness']
                stemmed_concept_scores['aptitude'] += concept_data['sentics']['aptitude']
                stemmed_concept_scores['sensitivity'] += concept_data['sentics']['sensitivity']

        # print(concept_scores)
        if len(concepts) > 0:
            for k, v in concept_scores.items():
                concept_scores[k] /= len(concepts)
        else:
            null +=1
        if len(stemmed_concepts) > 0:
            for k, v in stemmed_concept_scores.items():
                stemmed_concept_scores[k] /= len(stemmed_concepts)
            else:
                stemmed_null +=1

        # print(concept_scores)
        source_df.set_value(index, 'polarity', concept_scores['polarity'])
        source_df.set_value(index, 'attention', concept_scores['attention'])
        source_df.set_value(index, 'pleasantness', concept_scores['pleasantness'])
        source_df.set_value(index, 'aptitude', concept_scores['aptitude'])
        source_df.set_value(index, 'sensitivity', concept_scores['sensitivity'])
        source_df.set_value(index, 'concepts', ' '.join(concepts))
        source_df.set_value(index, 'stemmed_polarity', stemmed_concept_scores['polarity'])
        source_df.set_value(index, 'stemmed_attention', stemmed_concept_scores['attention'])
        source_df.set_value(index, 'stemmed_pleasantness', stemmed_concept_scores['pleasantness'])
        source_df.set_value(index, 'stemmed_aptitude', stemmed_concept_scores['aptitude'])
        source_df.set_value(index, 'stemmed_sensitivity', stemmed_concept_scores['sensitivity'])
        source_df.set_value(index, 'stemmed_concepts', ' '.join(stemmed_concepts))

        # joblib.dump(concept_scores, os.path.join(config.SENTIC_SCORES_DATA_DIR + file.replace('.txt', '.pkl')))
        counter += 1
        if(counter % 500 == 0):
            print(counter)
        # break
    print('Null: ',null,' Stemmed Null: ',stemmed_null )
    source_df.to_csv(dest_dir, index=False)

In [6]:
#tickers = ['AAPL','AMZN','GOOGL','MSFT','FB','INTC','CSCO','CMCSA','NVDA','NFLX']     
tickers=['PEP','BKNG','ADBE','AMGN','TXN','AVGO','PYPL','GILD','COST','QCOM'] 

In [8]:
for ticker in tickers:
    print(ticker)
    extract_sentic_concepts_and_scores_csv('TITLE', ticker)

PEP




500
1000
1500
2000
2500
3000
Null:  37  Stemmed Null:  3228
BKNG
500
1000
1500
2000
Null:  55  Stemmed Null:  2125
ADBE
500
1000
1500
2000
2500
Null:  33  Stemmed Null:  2771
AMGN
500
1000
1500
2000
2500
3000
3500
Null:  65  Stemmed Null:  3706
TXN
500
1000
1500
Null:  26  Stemmed Null:  1921
AVGO
500
1000
1500
2000
2500
3000
3500
4000
Null:  72  Stemmed Null:  4392
PYPL
500
1000
1500
2000
2500
3000
3500
4000
Null:  83  Stemmed Null:  3974
GILD
500
1000
1500
2000
2500
3000
3500
4000
Null:  76  Stemmed Null:  3998
COST
500
1000
1500
2000
2500
3000
3500
4000
Null:  42  Stemmed Null:  4321
QCOM
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
Null:  116  Stemmed Null:  8314
