In [31]:
import pandas as pd
import s3fs
from sklearn.preprocessing import MinMaxScaler

year = "2023"
month = "05"
day = "17"

s3 = s3fs.S3FileSystem(anon=False)

files_location = (
    f"s3://social-signals-dev-data/reddit/year={year}/month={month}/day={day}/combined.csv"
)
files = s3.glob(files_location)
print(len(files))
print(f"Processing files {files}")

dfs = [pd.read_csv(f"s3://{file}") for file in files]
df = pd.concat(dfs, ignore_index=True)
print(f"Shape of the final dataframe is {df.shape}")

print("Calculating submission rank")
df["submission_rank"] = df["submission_score"] / df["subreddit_subscribers"]

print("Calculating comment rank")
df["comment_rank"] = df["submission_num_comments"] / df["submission_score"]

#print(df.head())
#df.dropna(inplace=True)
#df = df.reset_index()
#df1 = df[df.isna().any(axis=1)]
#print(df1.head())

print("Performing Min-Max scaling")
scaler = MinMaxScaler()
df[["submission_rank", "comment_rank"]] = scaler.fit_transform(
    df[["submission_rank", "comment_rank"]]
)

print("Calculating Social Signals rank")
df["social_signals_rank"] = (0.35 * df["submission_rank"]) + (0.65 * df["comment_rank"])

df = df.sort_values(by=['social_signals_rank'], ascending=False)
df.to_csv("~/Desktop/ss.csv")
df.head()

7
Processing files ['social-signals-dev-data/reddit/year=2023/month=05/day=17/entertainment.csv', 'social-signals-dev-data/reddit/year=2023/month=05/day=17/movies.csv', 'social-signals-dev-data/reddit/year=2023/month=05/day=17/news.csv', 'social-signals-dev-data/reddit/year=2023/month=05/day=17/politics.csv', 'social-signals-dev-data/reddit/year=2023/month=05/day=17/sports.csv', 'social-signals-dev-data/reddit/year=2023/month=05/day=17/technology.csv', 'social-signals-dev-data/reddit/year=2023/month=05/day=17/television.csv']
Shape of the final dataframe is (170, 14)
Calculating submission rank
Calculating comment rank
Performing Min-Max scaling
Calculating Social Signals rank


Unnamed: 0,subreddit_name,subreddit_subscribers,submission_id,submission_url,submission_title,submission_score,organization,person,location,title_emotion_prediction,title_esg_categories_prediction,submission_num_comments,comments,comments_emotion_counter,submission_rank,comment_rank,social_signals_rank
168,television,16867810,13jf7jb,https://www.reddit.com/r/television/comments/1...,TV Shows you don't really like but you continu...,9,,,,neutral,Non-ESG,44,"Wow, don’t do that to yourself. Why waste time...","{""disgust"": 1, ""neutral"": 6, ""surprise"": 1}",5.9e-05,1.0,0.650021
75,politics,8324008,13jd3tf,https://www.axios.com/2023/05/16/george-santos...,Democrats trigger vote on expelling George San...,55009,Congress,George Santos,,neutral,Community Relations,1648,"\nAs a reminder, this subreddit [is for civil ...","{""disgust"": 1, ""neutral"": 1}",1.0,0.006128,0.353983
44,movies,30927309,13jai48,https://moviesdusk.com/fast-x-director-initial...,Fast X Director Initially Worried He Would Kil...,34,,,,fear,Corporate Governance,60,Stopping the gravy train? ?? Not if Vin Diesel...,"{""anger"": 1, ""disgust"": 1, ""fear"": 1, ""neutral...",0.000144,0.360963,0.234676
23,entertainment,3878059,13j6hu4,https://www.theguardian.com/uk-news/2023/may/1...,Rapper Slowthai appears in court charged with ...,4,,Slowthai,,neutral,Non-ESG,7,Why would you spend massive amounts of money t...,{},0.000134,0.357955,0.232717
0,entertainment,3878059,13j8l4n,https://www.independent.co.uk/arts-entertainme...,Arnold Schwarzenegger criticises recent Termin...,15152,,Arnold Schwarzenegger,,neutral,Business Ethics & Values,1139,Genysis ignored Rise Of The Machines and Salva...,"{""disgust"": 1, ""neutral"": 1}",0.591218,0.015376,0.216921


In [1]:
import pandas as pd

year = "2023"
month = "05"
day = "17"

In [2]:
input_path = f"s3://social-signals-dev-data/reddit/year={year}/month={month}/day={day}/combined.csv"
df = pd.read_csv(input_path)
 
df_organization = df[df["organization"] != NONE_FILLER]
df_person = df[df["person"] != NONE_FILLER]
df_location = df[df["location"] != NONE_FILLER]


NameError: name 'NONE_FILLER' is not defined

In [40]:
#import the libraries
import pandas as pd                        
from pytrends.request import TrendReq
pytrend = TrendReq()
#provide your search terms
kw_list=['George Santos']
pytrend.build_payload(kw_list=kw_list)
#get related queries
related_queries = pytrend.related_queries()
related_queries.values()


#build lists dataframes

top = list(related_queries.values())[0]['top']
print(top)
rising = list(related_queries.values())[0]['rising']
print(rising)

#convert lists to dataframes

dftop = pd.DataFrame(top)
dfrising = pd.DataFrame(rising)

#join two data frames
joindfs = [dftop, dfrising]
allqueries = pd.concat(joindfs, axis=1)

#function to change duplicates

cols=pd.Series(allqueries.columns)
for dup in allqueries.columns[allqueries.columns.duplicated(keep=False)]: 
    cols[allqueries.columns.get_loc(dup)] = ([dup + '.' + str(d_idx) 
                                     if d_idx != 0 
                                     else dup 
                                     for d_idx in range(allqueries.columns.get_loc(dup).sum())]
                                    )
allqueries.columns=cols

#rename to proper names

allqueries.rename({'query': 'top query', 'value': 'top query value', 'query.1': 'related query', 'value.1': 'related query value'}, axis=1, inplace=True) 

#check your dataset
allqueries.head()

                        query  value
0          george santos news    100
1    republican george santos     84
2          george santos drag     59
3                       trump     40
4         george santos trump     40
5            george de santos     33
6        who is george santos     32
7    drag queen george santos     31
8                 santos lies     29
9          george santos lies     29
10  congressman george santos     28
11          george santos gay     27
12      twitter george santos     26
13     george santos congress     26
14     george santos new york     21
15     george santos district     21
16           george santos ny     21
17         george santos name     16
18          george santos cnn     15
19                        cnn     15
20                george best     15
21                 trump news     14
22       is george santos gay     13
23       george santos brazil     12
24                   mccarthy     12
                                   que

Unnamed: 0,top query,top query value,related query,related query value
0,george santos news,100,republican george santos,402050
1,republican george santos,84,george santos drag,282950
2,george santos drag,59,santos lies,140750
3,trump,40,congressman george santos,134900
4,george santos trump,40,george santos congress,122900
