In [1]:
import pandas as pd
import pymongo
import json
from collections import defaultdict
from pprint import pprint

In [2]:
jaccard_similarity = pd.read_csv("/user/projects/project-3-techChanakya/data/jaccard_similarity.csv")
betweeness = pd.read_csv("/user/projects/project-3-techChanakya/data/betweeness_results_full.csv")
pagerank = pd.read_csv("/user/projects/project-3-techChanakya/data/pagerank_results_full.csv")
pearson_high = pd.read_csv("/user/projects/project-3-techChanakya/data/pearson_high_correlation_0.8_or_more.csv")
pearson_low = pd.read_csv("/user/projects/project-3-techChanakya/data/pearson_high_negative_correlation_0.8_or_more.csv")
louvain = pd.read_csv("/user/projects/project-3-techChanakya/data/louvain_communities_full.csv")

In [3]:
# Create stocks with high and low similarity based on Jaccard Similarity scores
similar_threshold = 0.8
dissimilar_threshold = 0.5

# Take a subset of jaccard data frame
tickers = jaccard_similarity[['stock1', 'stock2', 'similarity']]

similar_dict = defaultdict(set)
dissimilar_dict = defaultdict(set)

for _, row in tickers.iterrows():
    s1, s2, sim = row['stock1'], row['stock2'], row['similarity']
    if sim >= similar_threshold:
        similar_dict[s1].add(s2)
        similar_dict[s2].add(s1)
    elif sim <= dissimilar_threshold:
        dissimilar_dict[s1].add(s2)
        dissimilar_dict[s2].add(s1)

# Get all unique stock tickers
all_stocks = set(tickers['stock1']).union(tickers['stock2'])
    
result_df = pd.DataFrame({
    'ticker': list(all_stocks),
    'jaccard_similar': [list(similar_dict[stock]) for stock in all_stocks],
    'jaccard_dissimilar': [list(dissimilar_dict[stock]) for stock in all_stocks]
})

In [4]:
# Create stocks with high and low similarity based on Pearson correlation scores
# Combine both directions of the stock relationships
combined = pd.concat([
    pearson_high[['Stock1', 'Stock2']],
    pearson_high.rename(columns={'Stock1': 'Stock2', 'Stock2': 'Stock1'})[['Stock1', 'Stock2']]
])

# Group by stock1 and collect all associated stock2s
grouped = combined.groupby('Stock1')['Stock2'].apply(lambda x: sorted(set(x))).reset_index()

# Rename for clarity
grouped.columns = ['ticker', 'pearson_similar']
    

In [5]:
# Combine both directions of the stock relationships
combined = pd.concat([
    pearson_low[['Stock1', 'Stock2']],
    pearson_low.rename(columns={'Stock1': 'Stock2', 'Stock2': 'Stock1'})[['Stock1', 'Stock2']]
])

# Group by stock1 and collect all associated stock2s
grouped_low = combined.groupby('Stock1')['Stock2'].apply(lambda x: sorted(set(x))).reset_index()

# Rename for clarity
grouped_low.columns = ['ticker', 'pearson_dissimilar']

In [7]:
# Merge all results into a single dataframe

result_df = pd.merge(result_df, betweeness, on='ticker', how='inner')

result_df = pd.merge(result_df, pagerank, on='ticker', how='inner')

louvain_new = louvain[['ticker','community']]
louvain_new.rename(columns={'community':'louvain_community'}, inplace=True)
result_df = pd.merge(result_df, louvain_new, on='ticker', how='inner')

result_df = pd.merge(result_df, grouped, on='ticker', how='left')
result_df = pd.merge(result_df, grouped_low, on='ticker', how='left')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [8]:
result_df

Unnamed: 0,ticker,jaccard_similar,jaccard_dissimilar,betweenness_score,pagerank_score,louvain_community,pearson_similar,pearson_dissimilar
0,SGEN,"[ANSS, CDW, AVGO, CDNS, CHTR, ALGN, ASML, ADSK...",[],0.0,0.150000,82,,
1,FOXA,"[TMUS, COST, MAR, AMZN, EA, FAST, XEL, AMGN, AEP]",[ANSS],0.0,0.185125,77,[FOX],
2,WBA,"[CSCO, CSX, AMAT, CMCSA, ATVI, ADI, CTSH, BIDU...",[],4290.0,0.484736,77,"[FOX, TCOM]","[AAPL, CPRT, CTAS, DOCU, IDXX, LULU, ORLY, PAY..."
3,VRSN,"[ANSS, CDW, AVGO, CDNS, CHTR, ALGN, ASML, ADSK...",[],0.0,0.150000,93,,
4,CPRT,"[ANSS, CDW, AVGO, CDNS, ASML, ALGN, CHTR, ADSK...",[],796.0,1.423022,77,"[AAPL, ADBE, ADP, ALGN, AMD, ANSS, ASML, AVGO,...","[FOX, KHC, PCAR, PDD, TCOM, WBA]"
...,...,...,...,...,...,...,...,...
97,CTSH,"[CSCO, WBA, PYPL, FISV, SBUX, CMCSA, KDP, MU, ...","[AAPL, INTC, NVDA, MSFT, AMD]",0.0,0.150000,28,,
98,VRSK,"[ANSS, CDW, AVGO, CDNS, CHTR, ALGN, ASML, ADSK...",[],526.0,1.148320,67,"[AAPL, ADBE, AEP, ALGN, AMD, ANSS, ASML, CDNS,...","[ATVI, INCY, KHC]"
99,EA,"[TMUS, FOXA, COST, MAR, AMZN, FAST, XEL, AMGN,...",[ANSS],0.0,0.150000,32,,
100,CRWD,"[CSCO, WBA, PYPL, FISV, SBUX, CMCSA, KDP, MU, ...","[MSFT, INTC, NVDA, AAPL]",39.0,0.899556,29,"[AAPL, ADBE, AVGO, CPRT, CTAS, DOCU, DXCM, EBA...","[DLTR, KHC]"


In [9]:
# Insert Data into Mongo

# Connect to Mongo db
mongo = pymongo.MongoClient("mongodb://mongo:27017/")

db = mongo["stocks"] # Database name

# Drop collection if exists
db.drop_collection('ticker_groups')

# Create collection
collection = db['ticker_groups'] # My collection name

# Covert DataFrame to Mongo Documents
data = result_df.to_dict(orient='records')

# Insert into MongoDB
collection.insert_many(data)

<pymongo.results.InsertManyResult at 0x7f3a46831b40>

In [12]:
# Retrieve a Mongo Document
ticker_to_find = 'AAPL'
document = collection.find_one({'ticker': ticker_to_find})

pprint(document)

{'_id': ObjectId('67fd9819b07b767971278d25'),
 'betweenness_score': 66.0,
 'jaccard_dissimilar': ['CSCO', 'CSX', 'ATVI', 'CTSH', 'BIDU', 'CRWD'],
 'jaccard_similar': ['MSFT', 'INTC', 'NVDA', 'AMD'],
 'louvain_community': 77,
 'pagerank_score': 1.4397808463255954,
 'pearson_dissimilar': ['ATVI', 'FOX', 'KHC', 'PCAR', 'PDD', 'TCOM', 'WBA'],
 'pearson_similar': ['ADBE',
                     'ADP',
                     'ALGN',
                     'AMD',
                     'ANSS',
                     'ASML',
                     'AVGO',
                     'CDNS',
                     'CDW',
                     'CHTR',
                     'CMCSA',
                     'COST',
                     'CPRT',
                     'CRWD',
                     'CTAS',
                     'DOCU',
                     'DXCM',
                     'EBAY',
                     'FB',
                     'GOOG',
                     'GOOGL',
                     'IDXX',
                     'IN