In [None]:
import pandas as pd
import requests
import json
import csv
import time
import datetime
import matplotlib.pyplot as plt
import networkx as nx 
import seaborn as sns 

%matplotlib inline 

In [None]:
# Do TF-IDF for these three months in the two datasets
# The tokenizer packages
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string

stopWords = set(stopwords.words('english')+ list(string.punctuation))

# The TF-IDF packages
from sklearn.feature_extraction.text import TfidfVectorizer

# Doing TF-IDF on the body of that month. 
def robustParse(text,replaceNL=True):
    """Extract the plain text of an HTML/markup string with BeautifulSoup.

    Parameters
    ----------
    text : str
        Raw comment text, possibly containing markup.
    replaceNL : bool, default True
        When True every newline in the extracted text is replaced by a single
        space; when False the extracted text is returned unchanged.

    Returns
    -------
    str or None
        The extracted text, or None when parsing fails (best-effort helper).
    """
    # The notebook never imported bs4, so the original body raised NameError
    # on every call and the bare ``except`` turned that into a silent None.
    # Importing here (outside the try) makes a missing dependency fail loudly.
    import bs4

    try:
        parsed = bs4.BeautifulSoup(text, "lxml").text
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # still propagate; parse failures keep returning None as before.
        return None

    return parsed.replace("\n", " ") if replaceNL else parsed
    
def tokenize(text,remove_stop=True, lower=True, stem=True, sep=" "):
    """Clean ``text`` and return its tokens joined by ``sep``.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_stop : bool, default True
        Drop tokens found in the module-level ``stopWords`` set
        (English stop words plus punctuation characters).
    lower : bool, default True
        Lower-case the text before tokenizing.
    stem : bool, default True
        Apply the Porter stemmer to every remaining token.
    sep : str, default " "
        Separator used to join the resulting tokens.

    Returns
    -------
    str
        The processed tokens joined by ``sep``.
    """
    if lower:
        text = text.lower()

    tokens = word_tokenize(text)  # NLTK's tokenizer

    if remove_stop:
        # Membership test is an exact string match against stopWords.
        tokens = [tok for tok in tokens if tok not in stopWords]

    if stem:
        # Build the stemmer once per call rather than once per token.
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(tok) for tok in tokens]

    return sep.join(tokens)

    
def getTFIDF(text_series):
    """Rank vocabulary terms by their summed TF-IDF weight over a corpus.

    Parameters
    ----------
    text_series : iterable of str
        One string per document; terms are the whitespace-separated tokens
        (e.g. the output of ``tokenize``).

    Returns
    -------
    pandas.Series
        Indexed by term; each value is that term's TF-IDF weight summed over
        all documents, sorted from largest to smallest.
    """
    import numpy as np  # local import: numpy is only imported further down the notebook

    # Every run of non-whitespace characters counts as one term.
    vectorizer = TfidfVectorizer(token_pattern=r'[^\s]+')

    # Keep the document-term matrix sparse: densifying it allocates
    # n_documents x n_terms floats, which does not scale to a large corpus.
    tfidf = vectorizer.fit_transform(text_series)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # axis=0 collapses the document rows, leaving one total per term.
    totals = np.asarray(tfidf.sum(axis=0)).ravel()

    return pd.Series(totals, index=terms).sort_values(ascending=False)

In [None]:
gme12529 = pd.read_csv('gme/gme12529.csv',dtype=str)
cgme12529 = gme12529[(gme12529['Body']!='[removed]') & (gme12529['Body']!='[deleted]') & (gme12529['Author']!='[deleted]')]

In [None]:
len(cgme12529)

In [None]:
len(gme12529)

In [None]:
cgme12529['Author'].value_counts()

In [None]:
c1gme12529 = cgme12529[~cgme12529['Body'].isna()]

In [None]:
type(cgme12529["Title"])

In [None]:
peak1corpus = pd.concat([cgme12529["Title"],c1gme12529['Body']])
peak1corpus

In [None]:
peak1topwords = getTFIDF(peak1corpus.map(tokenize))
peak1topwords

In [None]:
peak1topwordsdf = pd.DataFrame(peak1topwords)
peak1topwordsdf.to_csv('/Users/elaine/Desktop/peak1topwords.csv')

In [None]:
# The comment history is stored as a series of CSV batches whose file names
# end in a numeric suffix. Build the list of paths explicitly instead of
# creating one global variable per file with exec(); the concatenation order
# (random batches, then the "rest" batches, then the final 27814 batch) is the
# same as before.
peak1_comment_files = (
    [f'1st gme comments/random/gme12529author{i}.csv' for i in range(1000, 5001, 1000)]
    + [f'1st gme comments/gme12529authorrest{i}.csv' for i in range(1000, 27001, 1000)]
    + ['1st gme comments/gme12529authorrest27814.csv']
)

# No ignore_index: each batch keeps its own row labels, as in the original concat.
peak1aucomments = pd.concat([pd.read_csv(path) for path in peak1_comment_files])

In [None]:
len(peak1aucomments)

In [None]:
peak1aucomments.Author.value_counts()

In [None]:
# first time commented in wsb
# NOTE: this import rebinds the name `datetime` from the module (imported at
# the top of the notebook) to the class; later cells call `datetime(year=...)`
# and rely on that.
from datetime import datetime
# Vectorised parse of 'YYYY-MM-DD HH:MM:SS' strings. Unlike a per-row
# strptime, missing values become NaT instead of raising, and re-running the
# cell on an already-parsed column is harmless.
peak1aucomments["Publish Date"] = pd.to_datetime(peak1aucomments['Publish Date'], format='%Y-%m-%d %H:%M:%S')

In [None]:
peak1au1stcom = peak1aucomments.sort_values("Publish Date").drop_duplicates("Author",keep='first')
peak1aulastcom = peak1aucomments.sort_values("Publish Date",ascending=False).drop_duplicates("Author",keep='first')

In [None]:
# Start the per-author summary table with each author's earliest comment time.
# rename() returns a new frame, which avoids an in-place edit of a column
# slice (the source of SettingWithCopyWarning).
sumpeak1au = peak1au1stcom[['Author','Publish Date']].rename(columns={'Publish Date':'first_commented_in_wsb'})
len(sumpeak1au)

In [None]:
sumpeak1au = sumpeak1au.merge(peak1aulastcom[['Author','Publish Date']],left_on='Author',right_on='Author',how='left')
sumpeak1au.rename(columns={'Publish Date':'last_commented_in_wsb'},inplace=True)
print(len(sumpeak1au))

In [None]:
# The post history is stored as a series of CSV batches whose file names end
# in a numeric suffix. Build the list of paths explicitly instead of creating
# one global variable per file with exec(); the concatenation order (random
# batches, then the "rest" batches, then the final 27814 batch) is unchanged.
peak1_post_files = (
    [f'1st gme posts/random/gme12529authorpost{i}.csv' for i in range(1000, 5001, 1000)]
    + [f'1st gme posts/gme12529restaupost{i}.csv' for i in range(1000, 27001, 1000)]
    + ['1st gme posts/gme12529restaupost27814.csv']
)

# No ignore_index: each batch keeps its own row labels, as in the original concat.
peak1auposts = pd.concat([pd.read_csv(path) for path in peak1_post_files])

In [None]:
# Vectorised parse of 'YYYY-MM-DD HH:MM:SS' strings; missing values become NaT
# and re-running the cell on an already-parsed column is harmless.
peak1auposts["created"] = pd.to_datetime(peak1auposts['created'], format='%Y-%m-%d %H:%M:%S')

In [None]:
len(peak1auposts)

In [None]:
peak1auposts.author.value_counts()

In [None]:
# First time each author posted in wsb.
# Sort ascending by creation time and keep one row per author: the earliest post.
peak1au1stpo = peak1auposts.sort_values("created").drop_duplicates("author",keep='first')
peak1au1stpo = peak1au1stpo[['author','created']]
# Only first posts created before 2021-01-30 are merged in. The outer join
# keeps authors found on either side; rows with no 'author' value are the
# ones that had no qualifying first post.
sumpeak1au = sumpeak1au.merge(peak1au1stpo[peak1au1stpo.created < '2021-01-30 00:00:00'],left_on='Author',right_on='author',how='outer')
sumpeak1au.rename(columns={'created':'first_posted_in_wsb'},inplace = True)
sumpeak1au

In [None]:
sumpeak1au = sumpeak1au[~sumpeak1au.author.isna()]
print(len(sumpeak1au))

In [None]:
del sumpeak1au['Author']

In [None]:
#last time posted in wsb
peak1aulastpo = peak1auposts.sort_values("created",ascending=False).drop_duplicates("author",keep='first')
sumpeak1au = sumpeak1au.merge(peak1aulastpo[['author','created']],left_on='author',right_on='author',how='left')
sumpeak1au.rename(columns={'created':'last_posted_in_wsb'},inplace=True)
print(len(sumpeak1au))

In [None]:
# Posts that mention GameStop in the title or the body (case-insensitive).
# na=False makes a missing title/body count as "no match"; without it
# str.contains returns NaN for those rows and the combined mask is no longer
# purely boolean.
gme_pattern = "GME|Gamestop|GameStop|GAMESTOP|gamestop|Gamestop's|gme|GameStop's"
mentions_gme = (
    peak1auposts['title'].str.contains(gme_pattern, case=False, na=False)
    | peak1auposts['body'].str.contains(gme_pattern, case=False, na=False)
)
peak1augmepo = peak1auposts[mentions_gme]

# Earliest and latest GME post per author.
peak11stpogme = peak1augmepo.sort_values("created").drop_duplicates("author",keep='first')
peak1lastpogme = peak1augmepo.sort_values("created",ascending=False).drop_duplicates("author",keep='first')

# Inner joins: authors without any GME post drop out of the summary table.
# The column name 'fisrt_post_gme' (sic) is kept because later cells read it.
sumpeak1au = sumpeak1au.merge(peak11stpogme[['author','created']],left_on='author',right_on='author',how='inner')
sumpeak1au.rename(columns={'created':'fisrt_post_gme'},inplace=True)
sumpeak1au = sumpeak1au.merge(peak1lastpogme[['author','created']],left_on='author',right_on='author',how='inner')
sumpeak1au.rename(columns={'created':'last_post_gme'},inplace=True)
len(sumpeak1au)

In [None]:
firstpogmeinpeak1 = peak1augmepo[(peak1augmepo.created >= '2021-01-25 00:00:00') & (peak1augmepo.created < '2021-01-30 00:00:00')].sort_values("created").drop_duplicates("author",keep='first')
sumpeak1au = sumpeak1au.merge(firstpogmeinpeak1[['author','created']],left_on='author',right_on='author',how='inner')
sumpeak1au.rename(columns={'created':'fisrt_po_gme_inpeak1'},inplace=True)
len(sumpeak1au)

In [None]:
# first/last time post gme, post gme frequency, average comment number, average gme post score
gmepofreq = pd.DataFrame(peak1augmepo[(peak1augmepo.created >= '2021-01-25 00:00:00') & (peak1augmepo.created < '2021-01-30 00:00:00')].groupby(['author'])['sub_id'].count())
gmeposcore = pd.DataFrame(peak1augmepo[(peak1augmepo.created >= '2021-01-25 00:00:00') & (peak1augmepo.created < '2021-01-30 00:00:00')].groupby(['author'])['score'].mean())
gmepocomNo = pd.DataFrame(peak1augmepo[(peak1augmepo.created >= '2021-01-25 00:00:00') & (peak1augmepo.created < '2021-01-30 00:00:00')].groupby(['author'])['numComms'].mean())

In [None]:
gmeposumscore = pd.DataFrame(peak1augmepo[(peak1augmepo.created >= '2021-01-25 00:00:00') & (peak1augmepo.created < '2021-01-30 00:00:00')].groupby(['author'])['score'].sum())

In [None]:
# Attach the per-author GME post statistics for the peak window.
# The post count used to be merged only into the side variable `sumpeak1au1`
# while the following merges restarted from `sumpeak1au`, so the 'sub_id'
# count never reached the summary table and the 'No_gmepost' rename below was
# a no-op. Chain the merges so the count column is carried along.
sumpeak1au1 = sumpeak1au.merge(gmepofreq,left_on='author',right_on='author',how='inner')
len(sumpeak1au1)
sumpeak1au = sumpeak1au1.merge(gmeposcore,left_on='author',right_on='author',how='inner')
sumpeak1au = sumpeak1au.merge(gmepocomNo,left_on='author',right_on='author',how='inner')
sumpeak1au.rename(columns={'sub_id':'No_gmepost','score':'gmepost_score','numComms':'No_gmepost_com'},inplace=True)
sumpeak1au

In [None]:
sumpeak1au = sumpeak1au.merge(gmeposumscore,left_on='author',right_on='author',how='inner')
sumpeak1au.rename(columns={'score':'gmepost_sumscore'},inplace=True)

In [None]:
# first/last time comment gme, comment gme frequency, average gme comment score

# Keep only comments written by authors present in the summary table.
peak1augmecom = peak1aucomments.merge(sumpeak1au,left_on='Author',right_on='author',how='inner')
# Drop the first three characters of 'Reply to' so it can be matched against
# 'Post ID' below — presumably a fixed id prefix (TODO confirm).
# NOTE(review): the lambda raises on a missing (NaN) 'Reply to' value.
peak1augmecom['Reply to'] = peak1augmecom['Reply to'].apply(lambda x: x[3:])
# Load every GME post file.
# NOTE(review): `gme12529` is rebound here with default dtypes, replacing the
# dtype=str frame loaded at the top of the notebook.
gme124 = pd.read_csv('gme/gme124.csv')
gme130 = pd.read_csv('gme/gme130.csv')
gme3713 = pd.read_csv('gme/gme3713.csv')
gme11016 = pd.read_csv('gme/gme11016.csv')
gme11723 = pd.read_csv('gme/gme11723.csv')
gme12529 = pd.read_csv('gme/gme12529.csv')
gme21420 = pd.read_csv('gme/gme21420.csv')
gme22136 = pd.read_csv('gme/gme22136.csv')
gme31420 = pd.read_csv('gme/gme31420.csv')
gme131213 = pd.read_csv('gme/gme131213.csv')
gme122119 = pd.read_csv('gme/gme12-21.csv')
gme321531 = pd.read_csv('gme/gme321531.csv')
allgmeposts = pd.concat([gme122119,gme11016,gme11723,gme124,gme12529,gme130,gme131213,gme21420,gme22136,gme3713,gme31420,gme321531])
# Left join: comments replying to a known GME post get that post's fields.
# Both sides have a 'Body' column, hence 'Body_x' (the comment) below.
# NOTE(review): if a 'Post ID' appears in more than one file, the matching
# comments are duplicated by this merge — verify the files do not overlap.
peak1augmecom = peak1augmecom.merge(allgmeposts[['Post ID','Title','Body','Flair']],left_on='Reply to',right_on='Post ID',how='left')
# A comment counts as GME-related when it replies to a GME post OR its own
# text mentions GME / GameStop (case-insensitive).
peak1augmecom = peak1augmecom[(~peak1augmecom['Post ID'].isna()) | (peak1augmecom['Body_x'].str.contains("GME|Gamestop|GameStop|GAMESTOP|gamestop|Gamestop's|gme|GameStop's",case=False))]



In [None]:
# Earliest GME comment per author inside the peak window (2021-01-25 to 2021-01-30).
# NOTE(review): the lower bound is exclusive (>) here, while the matching
# post filter (`firstpogmeinpeak1`) uses >= — confirm which is intended.
firstcomgmeinpeak1 = peak1augmecom[(peak1augmecom['Publish Date'] > '2021-01-25 00:00:00') & (peak1augmecom['Publish Date'] < '2021-01-30 00:00:00')].sort_values("Publish Date").drop_duplicates("Author",keep='first')
# Left join: authors without a GME comment in the window get NaT.
sumpeak1au = sumpeak1au.merge(firstcomgmeinpeak1[['Author','Publish Date']],left_on='author',right_on='Author',how='left')
# Column name 'fisrt_com_gme_inpeak1' (sic) is read by later cells.
sumpeak1au.rename(columns={'Publish Date':'fisrt_com_gme_inpeak1'},inplace=True)

In [None]:
print(len(peak1augmecom['Author'].value_counts()))
peak11stcomgme = peak1augmecom.sort_values("Publish Date").drop_duplicates("Author",keep='first')
print(len(peak11stcomgme))

In [None]:
peak1lastcomgme = peak1augmecom.sort_values("Publish Date",ascending=False).drop_duplicates("Author",keep='first')

In [None]:
sumpeak1au = sumpeak1au.merge(peak11stcomgme[['Author','Publish Date']],left_on='author',right_on='Author',how='left')
sumpeak1au.rename(columns={'Publish Date':'fisrt_com_gme'},inplace=True)
sumpeak1au = sumpeak1au.merge(peak1lastcomgme[['Author','Publish Date']],left_on='author',right_on='Author',how='left')
sumpeak1au.rename(columns={'Publish Date':'last_com_gme'},inplace=True)

print(len(sumpeak1au))

In [None]:
del sumpeak1au['Author_x']
del sumpeak1au['Author_y']

In [None]:
gmecomfreq = pd.DataFrame(peak1augmecom[(peak1augmecom['Publish Date'] > '2021-01-25 00:00:00') & (peak1augmecom['Publish Date']  < '2021-01-30 00:00:00')].groupby(['Author'])['Comment ID'].count())
gmecomscore = pd.DataFrame(peak1augmecom[(peak1augmecom['Publish Date'] > '2021-01-25 00:00:00') & (peak1augmecom['Publish Date']  < '2021-01-30 00:00:00')].groupby(['Author'])['Score'].mean())
sumpeak1au = sumpeak1au.merge(gmecomfreq,left_on='author',right_on='Author',how='left')
sumpeak1au = sumpeak1au.merge(gmecomscore,left_on='author',right_on='Author',how='left')
sumpeak1au.rename(columns={'Comment ID':'No_gmecom','Score':'gmecom_score'},inplace=True)
print(len(sumpeak1au))

In [None]:
# First/last GME "Gain" post, and per-author count, mean score and mean
# number of comments of those posts.

# GME posts created inside the peak window.
# NOTE(review): lower bound is exclusive (>) here but inclusive (>=) in the
# other post-window filters — confirm which is intended.
gmepoinpeak1 = peak1augmepo[(peak1augmepo.created > '2021-01-25 00:00:00') & (peak1augmepo.created < '2021-01-30 00:00:00')]

# Keep only posts whose flair is exactly 'Gain'.
peak1gain = gmepoinpeak1[gmepoinpeak1['flair'] == 'Gain']

# Earliest / latest Gain post per author.
peak11stgain = peak1gain.sort_values("created").drop_duplicates("author",keep='first')
peak1lastgain = peak1gain.sort_values("created",ascending=False).drop_duplicates("author",keep='first')

# Per-author aggregates, each a one-column frame indexed by author.
gainfreq = pd.DataFrame(peak1gain.groupby(['author'])['sub_id'].count())
gainscore = pd.DataFrame(peak1gain.groupby(['author'])['score'].mean())
gaincomNo = pd.DataFrame(peak1gain.groupby(['author'])['numComms'].mean())


In [None]:
sumpeak1au = sumpeak1au.merge(peak11stgain[['author','created']],left_on='author',right_on='author',how='left')
sumpeak1au.rename(columns={'created':'fisrt_gain'},inplace=True)
sumpeak1au = sumpeak1au.merge(peak1lastgain[['author','created']],left_on='author',right_on='author',how='left')
sumpeak1au.rename(columns={'created':'last_gain'},inplace=True)
sumpeak1au = sumpeak1au.merge(gainfreq,left_on='author',right_on='author',how='left')
sumpeak1au = sumpeak1au.merge(gainscore,left_on='author',right_on='author',how='left')
sumpeak1au = sumpeak1au.merge(gaincomNo,left_on='author',right_on='author',how='left')
sumpeak1au.rename(columns={'sub_id':'No_gainpost','score':'gain_score','numComms':'No_gain_com'},inplace=True)
print(len(sumpeak1au))

In [None]:
# First in wsb / First mention gme
#
# Each derived column combines a "post" timestamp with a "comment" timestamp.
# The previous row-by-row loops wrote through chained indexing
# (`df[col][i] = ...`), which is slow, triggers SettingWithCopyWarning and is
# a no-op under pandas Copy-on-Write. They also stored the timestamps in an
# object column initialised with ''. The vectorised helpers below keep the
# same selection rules and produce proper datetime columns.

def _earlier_of(posted, commented):
    """Row-wise earlier timestamp; uses ``posted`` when ``commented`` is NaT."""
    return posted.where(commented.isna() | (posted < commented), commented)


def _later_of(posted, commented):
    """Row-wise later timestamp; a NaT on one side yields the other side.

    The old loop returned NaT when ``posted`` was missing but ``commented``
    was not; here the available comment time is used instead.
    """
    return commented.where(commented.notna() & ~(posted >= commented), posted)


sumpeak1au['first_in_wsb'] = _earlier_of(sumpeak1au['first_posted_in_wsb'], sumpeak1au['first_commented_in_wsb'])
sumpeak1au['last_in_wsb'] = _later_of(sumpeak1au['last_posted_in_wsb'], sumpeak1au['last_commented_in_wsb'])

# The 'fisrt_*' spellings are the existing column names created by earlier cells.
sumpeak1au['first_gme'] = _earlier_of(sumpeak1au['fisrt_post_gme'], sumpeak1au['fisrt_com_gme'])
sumpeak1au['last_gme'] = _later_of(sumpeak1au['last_post_gme'], sumpeak1au['last_com_gme'])

sumpeak1au['first_gme_inpeak1'] = _earlier_of(sumpeak1au['fisrt_po_gme_inpeak1'], sumpeak1au['fisrt_com_gme_inpeak1'])

In [None]:
# categorize pre-gme / peak1 users (keyline: Jan 25th)
# pre-gme=0 peak1 = 1
# Vectorised replacement for the row loop that wrote through chained
# indexing. As before, a row is 0 only when its first activity is strictly
# before the cutoff; everything else (including a missing date) is 1.
peak1_start = datetime(year=2021, month=1, day=25, hour=0, minute=0, second=0)
sumpeak1au['first_cate'] = (~(sumpeak1au['first_in_wsb'] < peak1_start)).astype(int)

In [None]:
# New vs. Old
# Shares are taken over the current size of the summary table instead of a
# hard-coded author count, so they stay correct if the input data changes.
total_authors = len(sumpeak1au)

print(len(sumpeak1au[sumpeak1au['first_cate'] == 1])/total_authors)

# Authors whose first post came after the start of the peak.
n_first_post_in_peak = len(sumpeak1au[sumpeak1au['first_posted_in_wsb'] > '2021-01-25 00:00:00'])
print(n_first_post_in_peak)
print(n_first_post_in_peak/total_authors)

# Authors whose first comment came after the start of the peak.
n_first_comment_in_peak = len(sumpeak1au[sumpeak1au['first_commented_in_wsb'] > '2021-01-25 00:00:00'])
print(n_first_comment_in_peak)
print(n_first_comment_in_peak/total_authors)

# Authors whose first activity of either kind came after the start of the peak.
n_first_activity_in_peak = len(sumpeak1au[sumpeak1au['first_in_wsb'] > datetime(year=2021, month=1, day=25, hour=0, minute=0, second=0)])
print(n_first_activity_in_peak)
print(n_first_activity_in_peak/total_authors)

In [None]:
tpeak1au1stpo = sumpeak1au.set_index("first_in_wsb")
tpeak1au1stpo.groupby(pd.Grouper(freq='M'))['author'].count().plot(label='number of authors')
plt.legend()
plt.savefig('/Users/elaine/Desktop/firstinwsb.png',bbox_inches = 'tight')
plt.show()

In [None]:
sumpeak1au['post_length']=(datetime(year=2021, month=1, day=30, hour=23, minute=59, second=59)-sumpeak1au['first_posted_in_wsb']).apply(lambda x: x.days)

In [None]:
sumpeak1au['comment_length']=(datetime(year=2021, month=1, day=30, hour=23, minute=59, second=59)-sumpeak1au['first_commented_in_wsb']).apply(lambda x: x.days)

In [None]:
sumpeak1au['content_length']=(datetime(year=2021, month=1, day=30, hour=23, minute=59, second=59)-sumpeak1au['first_in_wsb']).apply(lambda x: x.days)

In [None]:
print(sumpeak1au['post_length'].mean())
print(sumpeak1au['post_length'].median())
print(sumpeak1au['post_length'].std())

print(sumpeak1au['comment_length'].mean())
print(sumpeak1au['comment_length'].median())
print(sumpeak1au['comment_length'].std())

print(sumpeak1au['content_length'].mean())
print(sumpeak1au['content_length'].median())
print(sumpeak1au['content_length'].std())

In [None]:
#when they first post gme
sumpeak1au['firstgme_length']=(datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)-sumpeak1au['first_gme']).apply(lambda x: x.days)
print(sumpeak1au['firstgme_length'].mean())
print(sumpeak1au['firstgme_length'].median())
print(sumpeak1au['firstgme_length'].std())

In [None]:
tfg = sumpeak1au[sumpeak1au['first_gme'] > datetime(year=2020, month=1, day=1, hour=0, minute=0, second=0)]
tfg = tfg.set_index('first_gme')
tfg.groupby(pd.Grouper(freq='W'))['author'].count().plot(label='number of authors by week after 2020')
#plt.xlim(left=2021)
plt.legend()
plt.savefig('/Users/elaine/Desktop/firstgme.png',bbox_inches = 'tight')
plt.show()

In [None]:
peak1newcomers = sumpeak1au[sumpeak1au['first_cate'] == 1]
print(len(peak1newcomers))
peak1oldmembers = sumpeak1au[sumpeak1au['first_cate'] == 0]
print(len(peak1oldmembers))

In [None]:
# Count and share of each cohort whose last WSB activity (post or comment)
# is before 2021-01-30. The share uses the cohort's actual size rather than a
# hard-coded count. Output order is unchanged: newcomers first, then old members.
peak1_end = datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0)
for cohort in (peak1newcomers, peak1oldmembers):
    n_stopped = len(cohort[cohort['last_in_wsb'] < peak1_end])
    print(n_stopped)
    print(n_stopped/len(cohort))

In [None]:
# Count and share of each cohort whose last GME activity (post or comment)
# is before 2021-01-30. The share uses the cohort's actual size rather than a
# hard-coded count. Output order is unchanged: newcomers first, then old members.
peak1_end = datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0)
for cohort in (peak1newcomers, peak1oldmembers):
    n_stopped = len(cohort[cohort['last_gme'] < peak1_end])
    print(n_stopped)
    print(n_stopped/len(cohort))

In [None]:
len(peak1oldmembers[(peak1oldmembers['first_gme'] > datetime(year=2021, month=1, day=25, hour=0, minute=0, second=0)) & (peak1oldmembers['last_gme'] < datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0))])


In [None]:
2633/10284

In [None]:
#never wsb after 1.29
print((10522+2263)/29944)
#never GME after 1.29
print((13710+3785)/29944)
#only GME 1.25-29
print((13710+2633)/29944)

In [None]:
# Look at posts only: count and share of each cohort whose last WSB post is
# before 2021-01-30. The share uses the cohort's actual size rather than a
# hard-coded count. Output order is unchanged: newcomers first, then old members.
peak1_end = datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0)
for cohort in (peak1newcomers, peak1oldmembers):
    n_stopped = len(cohort[cohort['last_posted_in_wsb'] < peak1_end])
    print(n_stopped)
    print(n_stopped/len(cohort))

In [None]:
# Count and share of each cohort whose last GME post is before 2021-01-30.
# The share uses the cohort's actual size rather than a hard-coded count.
# Output order is unchanged: newcomers first, then old members.
peak1_end = datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0)
for cohort in (peak1newcomers, peak1oldmembers):
    n_stopped = len(cohort[cohort['last_post_gme'] < peak1_end])
    print(n_stopped)
    print(n_stopped/len(cohort))

In [None]:
len(peak1oldmembers[(peak1oldmembers['fisrt_post_gme'] > datetime(year=2021, month=1, day=25, hour=0, minute=0, second=0)) & (peak1oldmembers['last_post_gme'] < datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0))])


In [None]:
7154/10284

In [None]:
# care GME after 129
# Authors whose last GME activity is on or after 2021-01-30.
# .copy() gives an independent frame: later cells add 'care_length' and
# 'wsb_length' columns to it, which on a plain filtered slice raises
# SettingWithCopyWarning.
peak1augme129 = sumpeak1au[sumpeak1au['last_gme'] >= datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0)].copy()
len(peak1augme129)

In [None]:
peak1augme129['care_length'] = (peak1augme129['last_gme'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)
print(peak1augme129['care_length'].mean())
print(peak1augme129['care_length'].median())
print(peak1augme129['care_length'].std())
print(peak1augme129['care_length'].min())
print(peak1augme129['care_length'].max())

In [None]:
peak1augme129['wsb_length'] = (peak1augme129['last_in_wsb'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)
print(peak1augme129['wsb_length'].mean())
print(peak1augme129['wsb_length'].median())
print(peak1augme129['wsb_length'].std())
print(peak1augme129['wsb_length'].min())
print(peak1augme129['wsb_length'].max())

In [None]:
print(peak1augme129[peak1augme129['first_cate'] == 0]['care_length'].mean())
print(peak1augme129[peak1augme129['first_cate'] == 0]['care_length'].median())
print(peak1augme129[peak1augme129['first_cate'] == 0]['care_length'].std())

print(peak1augme129[peak1augme129['first_cate'] == 1]['care_length'].mean())
print(peak1augme129[peak1augme129['first_cate'] == 1]['care_length'].median())
print(peak1augme129[peak1augme129['first_cate'] == 1]['care_length'].std())

In [None]:
print(peak1augme129[peak1augme129['first_cate'] == 0]['wsb_length'].mean())
print(peak1augme129[peak1augme129['first_cate'] == 0]['wsb_length'].median())
print(peak1augme129[peak1augme129['first_cate'] == 0]['wsb_length'].std())

print(peak1augme129[peak1augme129['first_cate'] == 1]['wsb_length'].mean())
print(peak1augme129[peak1augme129['first_cate'] == 1]['wsb_length'].median())
print(peak1augme129[peak1augme129['first_cate'] == 1]['wsb_length'].std())

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import math
sns.set()

In [None]:
peak1augme129

In [None]:
# Logistic regression of first_cate (0 = pre-GME member, 1 = peak-1 newcomer)
# on care_length, for authors still active on GME after the peak.
y = peak1augme129['first_cate']
x1= peak1augme129['care_length']

# add_constant() adds the intercept column to the predictor.
x = sm.add_constant(x1)
x.shape

results_log = sm.Logit(y.astype(float),x.astype(float)).fit()
print(results_log.summary())

# In-sample prediction table, labelled as actual (rows) vs predicted (columns).
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1: 'Actual 1'})
display(cm_df)

# Model accuracy: diagonal of the table as a percentage of all cases
# (computed on the same data the model was fitted on).
cm = np.array(cm_df)
accuracy_train = 100*(cm[0,0]+cm[1,1])/cm.sum()
print ('The model accuracy based is {:.5}'.format(accuracy_train))

In [None]:
1/math.exp(-0.0121)
# -0.0121 is presumably the care_length coefficient from the fit above
# (TODO confirm against the printed summary). Read as an odds ratio: each
# additional day of continued GME activity multiplies the odds of being an
# old member (first_cate == 0) by about 1.012. This is a change in odds per
# day, not "1.012 times more likely".

In [None]:
# Logistic regression of first_cate (0 = pre-GME member, 1 = peak-1 newcomer)
# on wsb_length, for authors still active on GME after the peak.
y = peak1augme129['first_cate']
x1= peak1augme129['wsb_length']

# add_constant() adds the intercept column to the predictor.
x = sm.add_constant(x1)
x.shape

results_log = sm.Logit(y.astype(float),x.astype(float)).fit()
print(results_log.summary())

# In-sample prediction table, labelled as actual (rows) vs predicted (columns).
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1: 'Actual 1'})
display(cm_df)

# Model accuracy: diagonal of the table as a percentage of all cases
# (computed on the same data the model was fitted on).
cm = np.array(cm_df)
accuracy_train = 100*(cm[0,0]+cm[1,1])/cm.sum()
print ('The model accuracy based is {:.5}'.format(accuracy_train))

In [None]:
1/math.exp(-0.0141)
# -0.0141 is presumably the wsb_length coefficient from the fit above
# (TODO confirm against the printed summary). Read as an odds ratio: each
# additional day of continued WSB activity multiplies the odds of being an
# old member (first_cate == 0) by about 1.014. This is a change in odds per
# day, not "1.014 times more likely".

In [None]:
peak1gain = pd.DataFrame(peak1gain['author'].value_counts())

In [None]:
print(len(peak1gain))

In [None]:
peak1gaingme = pd.merge(peak1gain,sumpeak1au[['author','last_gme','last_in_wsb','first_cate']],left_index=True,right_on='author',how='inner')
print(len(peak1gaingme))
print(len(peak1gaingme[peak1gaingme['last_gme'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)]))
print(len(peak1gaingme[peak1gaingme['last_gme'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)])/2092)
print(len(peak1gaingme[peak1gaingme['last_in_wsb'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)]))
print(len(peak1gaingme[peak1gaingme['last_in_wsb'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)])/2092)

In [None]:
# Gain posters whose last GME activity is after 2021-01-29 23:59:59.
# .copy() gives an independent frame so the two new columns below are not
# written onto a filtered slice (SettingWithCopyWarning).
peak1gaingme129 = peak1gaingme[peak1gaingme['last_gme'] > datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)].copy()
print(len(peak1gaingme129))
# Whole days between 2021-01-29 00:00:00 and the last GME / last WSB activity.
peak1gaingme129['care_length'] = (peak1gaingme129['last_gme'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)
peak1gaingme129['wsb_length'] = (peak1gaingme129['last_in_wsb'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)
print(peak1gaingme129['care_length'].mean())
print(peak1gaingme129['care_length'].median())
print(peak1gaingme129['care_length'].std())
print(peak1gaingme129['wsb_length'].mean())
print(peak1gaingme129['wsb_length'].median())
print(peak1gaingme129['wsb_length'].std())

In [None]:
print(peak1gaingme129[peak1gaingme129['first_cate'] == 0]['care_length'].mean())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 0]['care_length'].median())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 0]['care_length'].std())

print(peak1gaingme129[peak1gaingme129['first_cate'] == 1]['care_length'].mean())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 1]['care_length'].median())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 1]['care_length'].std())

In [None]:
print(peak1gaingme129[peak1gaingme129['first_cate'] == 0]['wsb_length'].mean())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 0]['wsb_length'].median())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 0]['wsb_length'].std())

print(peak1gaingme129[peak1gaingme129['first_cate'] == 1]['wsb_length'].mean())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 1]['wsb_length'].median())
print(peak1gaingme129[peak1gaingme129['first_cate'] == 1]['wsb_length'].std())## 

In [None]:
# Logistic regression of first_cate (0 = pre-GME member, 1 = peak-1 newcomer)
# on care_length, restricted to authors of 'Gain' posts.
y = peak1gaingme129['first_cate']
x1= peak1gaingme129['care_length']

# add_constant() adds the intercept column to the predictor.
x = sm.add_constant(x1)
x.shape

results_log = sm.Logit(y.astype(float),x.astype(float)).fit()
print(results_log.summary())

# In-sample prediction table, labelled as actual (rows) vs predicted (columns).
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1: 'Actual 1'})
display(cm_df)

# Model accuracy: diagonal of the table as a percentage of all cases
# (computed on the same data the model was fitted on).
cm = np.array(cm_df)
accuracy_train = 100*(cm[0,0]+cm[1,1])/cm.sum()
print ('The model accuracy based is {:.5}'.format(accuracy_train))

In [None]:
1/math.exp(-0.0153)

In [None]:
# Logistic regression of first_cate (0 = pre-GME member, 1 = peak-1 newcomer)
# on wsb_length, restricted to authors of 'Gain' posts.
y = peak1gaingme129['first_cate']
x1= peak1gaingme129['wsb_length']

# add_constant() adds the intercept column to the predictor.
x = sm.add_constant(x1)
x.shape

results_log = sm.Logit(y.astype(float),x.astype(float)).fit()
print(results_log.summary())

# In-sample prediction table, labelled as actual (rows) vs predicted (columns).
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1: 'Actual 1'})
display(cm_df)

# Model accuracy: diagonal of the table as a percentage of all cases
# (computed on the same data the model was fitted on).
cm = np.array(cm_df)
accuracy_train = 100*(cm[0,0]+cm[1,1])/cm.sum()
print ('The model accuracy based is {:.5}'.format(accuracy_train))

In [None]:
peak1yolo = peak1augmepo[peak1augmepo['flair'] == 'YOLO']
peak11yolo = peak1yolo[(peak1yolo['created'] > '2021-01-25 00:00:00') & (peak1yolo['created'] < '2021-01-30 00:00:00')]['author'].value_counts()
peak11yolo = pd.DataFrame(peak11yolo)
len(peak11yolo)

In [None]:
peak1yologme = pd.merge(peak11yolo,sumpeak1au[['author','last_gme','last_in_wsb','first_cate']],left_index=True,right_on='author',how='inner')
len(peak1yologme)

In [None]:
print(len(peak1yologme[peak1yologme['last_gme'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)]))
print(len(peak1yologme[peak1yologme['last_gme'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)])/6636)
print(len(peak1yologme[peak1yologme['last_in_wsb'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)]))
print(len(peak1yologme[peak1yologme['last_in_wsb'] <= datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)])/6636)

In [None]:
# YOLO posters whose last GME activity is after 2021-01-29 23:59:59.
# .copy() gives an independent frame so the two new columns below are not
# written onto a filtered slice (SettingWithCopyWarning).
peak1yologme129 = peak1yologme[peak1yologme['last_gme'] > datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)].copy()
print(len(peak1yologme129))
# Whole days between 2021-01-29 00:00:00 and the last GME / last WSB activity.
peak1yologme129['care_length'] = (peak1yologme129['last_gme'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)
peak1yologme129['wsb_length'] = (peak1yologme129['last_in_wsb'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)
print(peak1yologme129['care_length'].mean())
print(peak1yologme129['care_length'].median())
print(peak1yologme129['care_length'].std())
print(peak1yologme129['wsb_length'].mean())
print(peak1yologme129['wsb_length'].median())
print(peak1yologme129['wsb_length'].std())

In [None]:
print(peak1yologme129[peak1yologme129['first_cate'] == 0]['care_length'].mean())
print(peak1yologme129[peak1yologme129['first_cate'] == 0]['care_length'].median())
print(peak1yologme129[peak1yologme129['first_cate'] == 0]['care_length'].std())

print(peak1yologme129[peak1yologme129['first_cate'] == 1]['care_length'].mean())
print(peak1yologme129[peak1yologme129['first_cate'] == 1]['care_length'].median())
print(peak1yologme129[peak1yologme129['first_cate'] == 1]['care_length'].std())

In [None]:
print(peak1yologme129[peak1yologme129['first_cate'] == 0]['wsb_length'].mean())
print(peak1yologme129[peak1yologme129['first_cate'] == 0]['wsb_length'].median())
print(peak1yologme129[peak1yologme129['first_cate'] == 0]['wsb_length'].std())

print(peak1yologme129[peak1yologme129['first_cate'] == 1]['wsb_length'].mean())
print(peak1yologme129[peak1yologme129['first_cate'] == 1]['wsb_length'].median())
print(peak1yologme129[peak1yologme129['first_cate'] == 1]['wsb_length'].std())

In [None]:
# Logistic regression of first_cate (0 = pre-GME member, 1 = peak-1 newcomer)
# on care_length, restricted to authors of 'YOLO' posts.
y = peak1yologme129['first_cate']
x1= peak1yologme129['care_length']

# add_constant() adds the intercept column to the predictor.
x = sm.add_constant(x1)
x.shape

results_log = sm.Logit(y.astype(float),x.astype(float)).fit()
print(results_log.summary())

# In-sample prediction table, labelled as actual (rows) vs predicted (columns).
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1: 'Actual 1'})
display(cm_df)

# Model accuracy: diagonal of the table as a percentage of all cases
# (computed on the same data the model was fitted on).
cm = np.array(cm_df)
accuracy_train = 100*(cm[0,0]+cm[1,1])/cm.sum()
print ('The model accuracy based is {:.5}'.format(accuracy_train))

In [None]:
# Logistic regression of first_cate (0 = pre-GME member, 1 = peak-1 newcomer)
# on wsb_length, restricted to authors of 'YOLO' posts.
y = peak1yologme129['first_cate']
x1= peak1yologme129['wsb_length']

# add_constant() adds the intercept column to the predictor.
x = sm.add_constant(x1)
x.shape

results_log = sm.Logit(y.astype(float),x.astype(float)).fit()
print(results_log.summary())

# In-sample prediction table, labelled as actual (rows) vs predicted (columns).
cm_df = pd.DataFrame(results_log.pred_table())
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1: 'Actual 1'})
display(cm_df)

# Model accuracy: diagonal of the table as a percentage of all cases
# (computed on the same data the model was fitted on).
cm = np.array(cm_df)
accuracy_train = 100*(cm[0,0]+cm[1,1])/cm.sum()
print ('The model accuracy based is {:.5}'.format(accuracy_train))

In [None]:
math.exp(-0.0118)

In [None]:
# Authors who posted before 1.29, posted nothing during 1.30-3.7, and posted again during 3.8-3.12.
# Base table: (author, created) rows of peak1augmepo joined with each author's first_cate.
peak1peak2 = peak1augmepo[['author', 'created']].merge(
    sumpeak1au[['author', 'first_cate']],
    on='author',
    how='inner',
)
len(peak1peak2)

In [None]:
# Left-join the (Author, Publish Date) rows of peak1augmecom onto every row of the same author
peak1peak2 = pd.merge(
    peak1peak2,
    peak1augmecom[['Author', 'Publish Date']],
    how='left',
    left_on='author',
    right_on='Author',
)

In [None]:
# Authors who published GME content (posts or comments) during 1.30-3.7
_posted_13037 = (peak1peak2['created'] > '2021-01-29 23:59:59') & (peak1peak2['created'] <= '2021-03-07 23:59:59')
_commented_13037 = (peak1peak2['Publish Date'] > '2021-01-29 23:59:59') & (peak1peak2['Publish Date'] <= '2021-03-07 23:59:59')
peak12po = peak1peak2.loc[_posted_13037, 'author'].value_counts()
peak12com = peak1peak2.loc[_commented_13037, 'author'].value_counts()


def Union(lst1, lst2):
    """Return the distinct elements of lst1 and lst2 as a list (order is not guaranteed)."""
    return list(set(lst1) | set(lst2))


peak12au13037 = Union(peak12po.index.tolist(), peak12com.index.tolist())


In [None]:
# Authors who published GME content (posts or comments) during 3.8-3.12
_posted_3812 = (peak1peak2['created'] > '2021-03-07 23:59:59') & (peak1peak2['created'] <= '2021-03-12 23:59:59')
_commented_3812 = (peak1peak2['Publish Date'] > '2021-03-07 23:59:59') & (peak1peak2['Publish Date'] <= '2021-03-12 23:59:59')
peak12po3812 = peak1peak2[_posted_3812]['author'].value_counts()
peak12com3812 = peak1peak2[_commented_3812]['author'].value_counts()
# Distinct authors from either source. The set union is written inline so this cell no
# longer defines the Union() helper a second time.
peak12au3812 = list(set(peak12po3812.index.tolist()) | set(peak12com3812.index.tolist()))

In [188]:
# Authors active during 3.8-3.12 that are NOT among the authors active during 1.30-3.7.
# Membership is tested against a set, so the filter is linear instead of scanning the
# whole list for every author; the order of peak12au3812 is preserved.
_active_13037 = set(peak12au13037)
peak12au = [author for author in peak12au3812 if author not in _active_13037]
len(peak12au)

304

In [None]:
# NOTE(review): 304 is the len(peak12au) output shown above; 12449 is a hard-coded
# denominator whose origin is not visible in this part of the notebook - TODO confirm.
304/12449

In [189]:
# Wrap the author list in a one-column frame so it can be merged on 'author'
peak12au = pd.DataFrame(data=peak12au, columns=['author'])

In [191]:
# Add last_gme, last_in_wsb and first_cate from the per-author summary
peak12au = pd.merge(
    peak12au,
    sumpeak1au[['author', 'last_gme', 'last_in_wsb', 'first_cate']],
    on='author',
    how='inner',
)

In [193]:
# Rows of peak12au with first_cate == 1
peak12au.loc[peak12au['first_cate'] == 1]

Unnamed: 0,author,last_gme,last_in_wsb,first_cate
0,kevlen123,2021-03-11 07:08:29,2021-03-11 07:08:29,1
1,Nahsor107,2021-04-01 05:11:57,2021-04-01 05:11:57,1
2,Tophurkey,2021-04-14 23:27:05,2021-04-14 23:27:05,1
4,mju516,2021-04-16 22:32:23,2021-04-16 22:32:23,1
5,jbronnier,2021-03-09 01:42:39,2021-03-09 01:42:39,1
...,...,...,...,...
296,ThatBoiZach,2021-03-11 02:06:01,2021-03-11 02:06:01,1
298,ijustwant2feelbetter,2021-03-17 12:21:42,2021-03-17 12:21:42,1
299,thebirdsnthebeemovie,2021-03-09 05:07:07,2021-03-09 05:07:07,1
300,NYJets18,2021-03-11 01:43:59,2021-03-11 01:43:59,1


In [None]:
# How many people used "moon" in GME content: count / time of first use /
# average upvotes (total post + comment score / total number of items).
# With case=False the single pattern 'moon' already matches every capitalisation.
# NOTE(review): str.contains yields NaN for rows with a missing body/title before the
# OR below - confirm that posts with an empty body but "moon" in the title are handled
# as intended.
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']

moongmepost = peak1augmepo[peak1augmepo.body.str.contains('moon', case=False)
                           | peak1augmepo.title.str.contains('moon', case=False)]
moongmecom = peak1augmecom[(~peak1augmecom.Body_x.isna())
                           & (peak1augmecom.Body_x.str.contains('moon', case=False))]

moongmepost = moongmepost[['author', 'created', 'sub_id']].merge(
    sumpeak1au[_author_cols], left_on='author', right_on='author', how='inner')
moongmecom = moongmecom[['Author', 'Publish Date', 'Comment ID']].merge(
    sumpeak1au[_author_cols], left_on='Author', right_on='author', how='inner')
del moongmecom['author']

# Per-author number of matching comments / posts
moongmecomfreq = moongmecom.groupby(['Author'])['Comment ID'].count().to_frame('No_gmecom_moon')
moongmepofreq = moongmepost.groupby(['author'])['sub_id'].count().to_frame('No_gmepo_moon')

# One row per matching post or comment with a common (created, id) schema.
# rename() returns a new frame here; renaming a column subset in place can raise
# SettingWithCopyWarning.
moongmeplotpo = moongmepost[['created', 'sub_id']].rename(columns={'sub_id': 'id'})
moongmeplotcom = moongmecom[['Publish Date', 'Comment ID']].rename(
    columns={'Publish Date': 'created', 'Comment ID': 'id'})
moongmeplot = pd.concat([moongmeplotpo, moongmeplotcom])
print(len(moongmeplot))

In [None]:
# Number of authors whose first_gme is later than 2021-01-10 00:00:00
_first_gme_cutoff = datetime(year=2021, month=1, day=10, hour=0, minute=0, second=0)
_late_starters = sumpeak1au[sumpeak1au['first_gme'] > _first_gme_cutoff]
print(len(_late_starters))

In [None]:
# Number of rows in the per-author summary
print(sumpeak1au.shape[0])

In [None]:
# NOTE(review): ad-hoc arithmetic; 31+30+31+28 look like month lengths in days, the
# meaning of 123 is not visible here - TODO confirm or remove this scratch cell.
123-(31+30+31+28)

In [None]:
# OLS: firstgme_length explained by truecontent_length
_ols_model = sm.ols(formula='firstgme_length ~ truecontent_length', data=sumpeak1au)
result = _ols_model.fit()
print(result.summary())

In [None]:
# First and last GME post - when they step out.
# gmepost_length = whole days between fisrt_post_gme and last_post_gme
_post_span = sumpeak1au['last_post_gme'] - sumpeak1au['fisrt_post_gme']
sumpeak1au['gmepost_length'] = _post_span.apply(lambda delta: delta.days)
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['gmepost_length'], _stat)())

In [None]:
# 75th percentile of gmepost_length
np.percentile(sumpeak1au['gmepost_length'], q=75)

In [None]:
# Authors whose first GME post is after 2021-01-25 00:00:00 and whose last GME post is
# before 2021-01-30 00:00:00: count, then share of the hard-coded 30013
# (presumably the total number of authors - TODO confirm).
_posts_within_peak = (sumpeak1au['fisrt_post_gme'] > '2021-01-25 00:00:00') & (sumpeak1au["last_post_gme"] < '2021-01-30 00:00:00')
_n_posts_within_peak = len(sumpeak1au[_posts_within_peak])
print(_n_posts_within_peak)
print(_n_posts_within_peak / 30013)


In [None]:
# Authors with a non-missing fisrt_com_gme: count, then share of the hard-coded 30013
_n_commenters = len(sumpeak1au[sumpeak1au['fisrt_com_gme'].notna()])
print(_n_commenters)
print(_n_commenters / 30013)

In [None]:
# gmecom_length = whole days between fisrt_com_gme and last_com_gme
_com_span = sumpeak1au['last_com_gme'] - sumpeak1au['fisrt_com_gme']
sumpeak1au['gmecom_length'] = _com_span.apply(lambda delta: delta.days)

In [None]:
# Mean, median and standard deviation of gmecom_length
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['gmecom_length'], _stat)())

In [None]:
# 55th percentile of gmecom_length over the authors that have a value
np.percentile(sumpeak1au['gmecom_length'].dropna(), 55)

In [None]:
# Authors whose first GME comment is after 2021-01-25 00:00:00 and whose last GME
# comment is before 2021-01-30 00:00:00: count, then share of the hard-coded 17112
# (origin of 17112 not visible here - TODO confirm).
_coms_within_peak = (sumpeak1au['fisrt_com_gme'] > '2021-01-25 00:00:00') & (sumpeak1au["last_com_gme"] < '2021-01-30 00:00:00')
_n_coms_within_peak = len(sumpeak1au[_coms_within_peak])
print(_n_coms_within_peak)
print(_n_coms_within_peak / 17112)


In [None]:
# gme_length = whole days between first_gme and last_gme, followed by its
# mean / median / standard deviation.
sumpeak1au['gme_length'] = (sumpeak1au['last_gme'] - sumpeak1au['first_gme']).apply(lambda x: x.days)
print(sumpeak1au['gme_length'].mean())
# The original line read `..median()`, which is a syntax error and made the cell unrunnable.
print(sumpeak1au['gme_length'].median())
print(sumpeak1au['gme_length'].std())

In [None]:
# NOTE(review): np is already used by earlier cells of this notebook, so this import
# belongs in the import cell at the top - on a fresh kernel those cells run before it.
import numpy as np
# 75th percentile of gme_length
np.percentile(sumpeak1au['gme_length'], 75)

In [None]:
# Authors whose first_gme is after 2021-01-25 00:00:00 and whose last_gme is before
# 2021-01-30 00:00:00: count, then share of the hard-coded 30013
_peak_start = datetime(year=2021, month=1, day=25, hour=0, minute=0, second=0)
_peak_end = datetime(year=2021, month=1, day=30, hour=0, minute=0, second=0)
_gme_within_peak = (sumpeak1au['first_gme'] > _peak_start) & (sumpeak1au["last_gme"] < _peak_end)
_n_gme_within_peak = len(sumpeak1au[_gme_within_peak])
print(_n_gme_within_peak)
print(_n_gme_within_peak / 30013)


In [None]:
# People who still cared about GME after 1.29 (last_gme later than 2021-01-29 23:59:59):
# count, then share of the hard-coded 30013
_end_of_129 = datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)
_n_after_129 = len(sumpeak1au[sumpeak1au['last_gme'] > _end_of_129])
print(_n_after_129)
print(_n_after_129 / 30013)

In [None]:
# Authors whose last_gme is later than 2021-01-29 23:59:59.
# .copy() makes this an independent frame, so adding columns to it (here and in later
# cells) does not raise pandas' SettingWithCopyWarning.
_after_129 = sumpeak1au['last_gme'] > datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)
peak1augme129 = sumpeak1au[_after_129].copy()
# care_length = whole days from 2021-01-29 00:00:00 to last_gme
peak1augme129['care_length'] = (peak1augme129['last_gme'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)


In [None]:
# Bar chart: number of users per care_length value
_care_counts = peak1augme129['care_length'].value_counts()
_care_counts.plot.bar(label='number of users')
plt.legend()
plt.show()

In [None]:
# Mean, median, standard deviation, minimum and maximum of care_length
for _stat in ('mean', 'median', 'std', 'min', 'max'):
    print(getattr(peak1augme129['care_length'], _stat)())

In [None]:
# 65th percentile of care_length
np.percentile(peak1augme129['care_length'], q=65)

In [None]:
# Time spent caring about GME for authors with Gain/YOLO posts during the first peak.
# peak11gain: per-author number of peak1gain rows created between 2021-01-25 and 2021-01-30
_gain_in_peak = (peak1gain['created'] > '2021-01-25 00:00:00') & (peak1gain['created'] < '2021-01-30 00:00:00')
peak11gain = peak1gain.loc[_gain_in_peak, 'author'].value_counts().to_frame()

In [None]:
# Number of distinct authors in peak11gain
peak11gain.shape[0]

In [None]:
# Join the per-author counts (authors are the index of peak11gain) with last_gme / first_in_wsb
peak1gaingme = peak11gain.merge(
    sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
    left_index=True,
    right_on='author',
    how='inner',
)

In [None]:
# peak1gaingme authors whose last_gme is later than 2021-01-29 23:59:59: count, then
# share of the hard-coded 2092 (origin not visible here - TODO confirm)
_end_of_129 = datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)
_n_gain_after_129 = len(peak1gaingme[peak1gaingme['last_gme'] > _end_of_129])
print(_n_gain_after_129)
print(_n_gain_after_129 / 2092)

In [None]:
# peak1gaingme authors whose last_gme is later than 2021-01-29 23:59:59.
# .copy() makes this an independent frame, so adding columns to it (here and in later
# cells) does not raise pandas' SettingWithCopyWarning.
_gain_after_129 = peak1gaingme['last_gme'] > datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)
peak1gaingme129 = peak1gaingme[_gain_after_129].copy()
# care_length = whole days from 2021-01-29 00:00:00 to last_gme
peak1gaingme129['care_length'] = (peak1gaingme129['last_gme'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)


In [None]:
# Bar chart: number of users per care_length value (peak1gaingme129)
_gain_care_counts = peak1gaingme129['care_length'].value_counts()
_gain_care_counts.plot.bar(label='number of users')
plt.legend()
plt.show()

In [None]:
# Mean, median, standard deviation, minimum and maximum of care_length (peak1gaingme129)
for _stat in ('mean', 'median', 'std', 'min', 'max'):
    print(getattr(peak1gaingme129['care_length'], _stat)())

In [None]:
# Posts flaired 'YOLO', and the per-author number of them created between 2021-01-25 and 2021-01-30
peak1yolo = peak1augmepo.loc[peak1augmepo['flair'] == 'YOLO']
_yolo_in_peak = (peak1yolo['created'] > '2021-01-25 00:00:00') & (peak1yolo['created'] < '2021-01-30 00:00:00')
peak11yolo = peak1yolo.loc[_yolo_in_peak, 'author'].value_counts().to_frame()

In [None]:
# Number of distinct authors in peak11yolo
peak11yolo.shape[0]

In [None]:
# Join the per-author counts (authors are the index of peak11yolo) with last_gme / first_in_wsb
peak1yologme = peak11yolo.merge(
    sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
    left_index=True,
    right_on='author',
    how='inner',
)

In [None]:
# peak1yologme authors whose last_gme is later than 2021-01-29 23:59:59: count, then
# share of the hard-coded 6636 (origin not visible here - TODO confirm)
_end_of_129 = datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)
_n_yolo_after_129 = len(peak1yologme[peak1yologme['last_gme'] > _end_of_129])
print(_n_yolo_after_129)
print(_n_yolo_after_129 / 6636)

In [None]:
# peak1yologme authors whose last_gme is later than 2021-01-29 23:59:59.
# .copy() makes this an independent frame, so adding columns to it (here and in later
# cells) does not raise pandas' SettingWithCopyWarning.
_yolo_after_129 = peak1yologme['last_gme'] > datetime(year=2021, month=1, day=29, hour=23, minute=59, second=59)
peak1yologme129 = peak1yologme[_yolo_after_129].copy()
# care_length = whole days from 2021-01-29 00:00:00 to last_gme
peak1yologme129['care_length'] = (peak1yologme129['last_gme'] - datetime(year=2021, month=1, day=29, hour=0, minute=0, second=0)).apply(lambda x: x.days)


In [None]:
# Mean, median, standard deviation, minimum and maximum of care_length (peak1yologme129)
for _stat in ('mean', 'median', 'std', 'min', 'max'):
    print(getattr(peak1yologme129['care_length'], _stat)())

In [None]:
# Score, frequency and number-of-comments versus time relationships
# (5.31 is the last day of data collection).
# Mean, median and standard deviation of gmepost_score
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['gmepost_score'], _stat)())

In [None]:
# Mean, median and standard deviation of No_gmepost_com
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['No_gmepost_com'], _stat)())

In [None]:
# Mean, median and standard deviation of No_gmepost
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['No_gmepost'], _stat)())

In [None]:
# Line plot of how many authors share each No_gmepost value
_gmepost_counts = sumpeak1au['No_gmepost'].value_counts()
_gmepost_counts.plot()

In [None]:
# Missing No_gmecom values become 0.
# fillna replaces the element-wise loop, which relied on chained assignment
# (sumpeak1au[col][i] = 0) and on Series.iteritems(), removed in pandas 2.0.
sumpeak1au['No_gmecom'] = sumpeak1au['No_gmecom'].fillna(0)

In [None]:
# No_gme = number of GME comments + number of GME posts per author
sumpeak1au['No_gme'] = sumpeak1au['No_gmecom'].add(sumpeak1au['No_gmepost'])

In [None]:
# Mean, median and standard deviation of No_gmecom
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['No_gmecom'], _stat)())

In [None]:
# Mean, median and standard deviation of No_gme
for _stat in ('mean', 'median', 'std'):
    print(getattr(sumpeak1au['No_gme'], _stat)())

In [None]:
# truecontent_length = whole days from first_in_wsb to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
sumpeak1au['truecontent_length'] = (_cutoff - sumpeak1au['first_in_wsb']).apply(lambda delta: delta.days)

In [None]:
# Export the per-author summary to CSV.
# NOTE(review): hard-coded absolute path inside one user's home directory - the cell
# fails on any other machine; prefer a path relative to a configurable data directory.
sumpeak1au.to_csv('/Users/elaine/Desktop/sumdata.csv')

In [None]:
import statsmodels.formula.api as sm

In [None]:
# OLS: gmecom_score explained by truecontent_length
_ols_model = sm.ols(formula='gmecom_score ~ truecontent_length', data=sumpeak1au)
result1 = _ols_model.fit()
print(result1.summary())

In [None]:
import scipy.stats as stats

In [None]:
# Pearson correlation between gmepost_score and No_gme
_score, _n_items = sumpeak1au['gmepost_score'], sumpeak1au['No_gme']
stats.pearsonr(_score, _n_items)

In [None]:
# Stay-time relationship after 1.29.
# OLS: gmecom_length explained by truecontent_length
_ols_model = sm.ols(formula='gmecom_length ~ truecontent_length', data=sumpeak1au)
result2 = _ols_model.fit()
print(result2.summary())

In [None]:
# truecontent_length = whole days from first_in_wsb to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1augme129['truecontent_length'] = (_cutoff - peak1augme129['first_in_wsb']).apply(lambda delta: delta.days)

In [None]:
# OLS: care_length explained by truecontent_length (peak1augme129)
_ols_model = sm.ols(formula='care_length ~ truecontent_length', data=peak1augme129)
result3 = _ols_model.fit()
print(result3.summary())

In [None]:
# truecontent_length = whole days from first_in_wsb to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1gaingme129['truecontent_length'] = (_cutoff - peak1gaingme129['first_in_wsb']).apply(lambda delta: delta.days)

In [None]:
# OLS: care_length explained by truecontent_length (peak1gaingme129)
_ols_model = sm.ols(formula='care_length ~ truecontent_length', data=peak1gaingme129)
result4 = _ols_model.fit()
print(result4.summary())

In [None]:
# truecontent_length = whole days from first_in_wsb to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1yologme129['truecontent_length'] = (_cutoff - peak1yologme129['first_in_wsb']).apply(lambda delta: delta.days)

In [None]:
# OLS: care_length explained by truecontent_length (peak1yologme129)
_ols_model = sm.ols(formula='care_length ~ truecontent_length', data=peak1yologme129)
result5 = _ols_model.fit()
print(result5.summary())

In [None]:
# Cultural form - meme/YOLO/News.
# Posts flaired 'Meme', each author's earliest such post, and the per-author post count.
peak1meme = peak1augmepo.loc[peak1augmepo['flair'] == 'Meme']
peak11stmeme = peak1meme.sort_values("created").drop_duplicates("author", keep='first')
peak1memefreq = peak1meme.groupby(['author'])['sub_id'].count().to_frame()

In [None]:
# Per-author meme counts joined with last_gme / first_in_wsb; the frame is displayed as cell output
peak1culmeme = peak1memefreq.merge(
    sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
    left_on='author',
    right_on='author',
    how='inner',
)
peak1culmeme

In [None]:
# Add the creation time of each author's first meme post and give the columns descriptive names
peak1culmeme = (
    peak1culmeme
    .merge(peak11stmeme[['author', 'created']], left_on='author', right_on='author', how='inner')
    .rename(columns={'created': 'fist_gme_meme', 'sub_id': 'No_gme_meme'})
)

In [None]:
# Whole days from first_in_wsb / first meme post to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1culmeme['content_length'] = (_cutoff - peak1culmeme['first_in_wsb']).apply(lambda delta: delta.days)
peak1culmeme['meme_length'] = (_cutoff - peak1culmeme['fist_gme_meme']).apply(lambda delta: delta.days)


In [None]:
# Rows of peak1culmeme relative to the hard-coded 30013 (presumably the total number of authors - TODO confirm)
peak1culmeme.shape[0] / 30013

In [None]:
# OLS: meme_length explained by content_length
_ols_model = sm.ols(formula='meme_length ~ content_length', data=peak1culmeme)
result6 = _ols_model.fit()
print(result6.summary())

In [None]:
# OLS: No_gme_meme explained by content_length
_ols_model = sm.ols(formula='No_gme_meme ~ content_length', data=peak1culmeme)
result6 = _ols_model.fit()
print(result6.summary())

In [None]:
# Posts flaired 'YOLO': each author's earliest such post and per-author post count,
# joined with last_gme / first_in_wsb from the per-author summary.
peak1yolo = peak1augmepo.loc[peak1augmepo['flair'] == 'YOLO']
peak11styolo = peak1yolo.sort_values("created").drop_duplicates("author", keep='first')
peak1yolofreq = peak1yolo.groupby(['author'])['sub_id'].count().to_frame()
peak1culyolo = (
    pd.merge(peak1yolofreq, sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
             left_on='author', right_on='author', how='inner')
    .merge(peak11styolo[['author', 'created']], left_on='author', right_on='author', how='inner')
    .rename(columns={'created': 'fist_gme_yolo', 'sub_id': 'No_gme_yolo'})
)
# Whole days from first_in_wsb / first YOLO post to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1culyolo['content_length'] = (_cutoff - peak1culyolo['first_in_wsb']).apply(lambda delta: delta.days)
peak1culyolo['yolo_length'] = (_cutoff - peak1culyolo['fist_gme_yolo']).apply(lambda delta: delta.days)


In [None]:
# Rows of peak1culyolo relative to the hard-coded 30013 (presumably the total number of authors - TODO confirm)
peak1culyolo.shape[0] / 30013

In [None]:
# OLS: yolo_length explained by content_length
_ols_model = sm.ols(formula='yolo_length ~ content_length', data=peak1culyolo)
result6 = _ols_model.fit()
print(result6.summary())

In [None]:
# OLS: No_gme_yolo explained by content_length
_ols_model = sm.ols(formula='No_gme_yolo ~ content_length', data=peak1culyolo)
result6 = _ols_model.fit()
print(result6.summary())

In [None]:
# Posts flaired 'News': each author's earliest such post and per-author post count,
# joined with last_gme / first_in_wsb from the per-author summary.
peak1news = peak1augmepo.loc[peak1augmepo['flair'] == 'News']
peak11stnews = peak1news.sort_values("created").drop_duplicates("author", keep='first')
peak1newsfreq = peak1news.groupby(['author'])['sub_id'].count().to_frame()
peak1culnews = (
    pd.merge(peak1newsfreq, sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
             left_on='author', right_on='author', how='inner')
    .merge(peak11stnews[['author', 'created']], left_on='author', right_on='author', how='inner')
    .rename(columns={'created': 'fist_gme_news', 'sub_id': 'No_gme_news'})
)
# Whole days from first_in_wsb / first News post to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1culnews['content_length'] = (_cutoff - peak1culnews['first_in_wsb']).apply(lambda delta: delta.days)
peak1culnews['news_length'] = (_cutoff - peak1culnews['fist_gme_news']).apply(lambda delta: delta.days)


In [None]:
# Rows of peak1culnews relative to the hard-coded 30013 (presumably the total number of authors - TODO confirm)
peak1culnews.shape[0] / 30013

In [None]:
# OLS: news_length explained by content_length
_ols_model = sm.ols(formula='news_length ~ content_length', data=peak1culnews)
result7 = _ols_model.fit()
print(result7.summary())

In [None]:
# OLS: No_gme_news explained by content_length
_ols_model = sm.ols(formula='No_gme_news ~ content_length', data=peak1culnews)
result7 = _ols_model.fit()
print(result7.summary())

In [None]:
# Posts flaired 'DD' or 'Technical Analysis': each author's earliest such post and
# per-author post count, joined with last_gme / first_in_wsb from the per-author summary.
peak1ana = peak1augmepo.loc[peak1augmepo['flair'].isin(['DD', 'Technical Analysis'])]
peak11stana = peak1ana.sort_values("created").drop_duplicates("author", keep='first')
peak1anafreq = peak1ana.groupby(['author'])['sub_id'].count().to_frame()
peak1culana = (
    pd.merge(peak1anafreq, sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
             left_on='author', right_on='author', how='inner')
    .merge(peak11stana[['author', 'created']], left_on='author', right_on='author', how='inner')
    .rename(columns={'created': 'fist_gme_ana', 'sub_id': 'No_gme_ana'})
)
# Whole days from first_in_wsb / first analysis post to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1culana['content_length'] = (_cutoff - peak1culana['first_in_wsb']).apply(lambda delta: delta.days)
peak1culana['ana_length'] = (_cutoff - peak1culana['fist_gme_ana']).apply(lambda delta: delta.days)


In [None]:
# Rows of peak1culana relative to the hard-coded 30013 (presumably the total number of authors - TODO confirm)
peak1culana.shape[0] / 30013

In [None]:
# OLS: ana_length explained by content_length
_ols_model = sm.ols(formula='ana_length ~ content_length', data=peak1culana)
result8 = _ols_model.fit()
print(result8.summary())

In [None]:
# OLS: No_gme_ana explained by content_length
_ols_model = sm.ols(formula='No_gme_ana ~ content_length', data=peak1culana)
result8 = _ols_model.fit()
print(result8.summary())

In [None]:
# Posts flaired 'Discussion': each author's earliest such post and per-author post count,
# joined with last_gme / first_in_wsb from the per-author summary.
peak1dis = peak1augmepo.loc[peak1augmepo['flair'] == 'Discussion']
peak11stdis = peak1dis.sort_values("created").drop_duplicates("author", keep='first')
peak1disfreq = peak1dis.groupby(['author'])['sub_id'].count().to_frame()
peak1culdis = (
    pd.merge(peak1disfreq, sumpeak1au[['author', 'last_gme', 'first_in_wsb']],
             left_on='author', right_on='author', how='inner')
    .merge(peak11stdis[['author', 'created']], left_on='author', right_on='author', how='inner')
    .rename(columns={'created': 'fist_gme_dis', 'sub_id': 'No_gme_dis'})
)
# Whole days from first_in_wsb / first Discussion post to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
peak1culdis['content_length'] = (_cutoff - peak1culdis['first_in_wsb']).apply(lambda delta: delta.days)
peak1culdis['dis_length'] = (_cutoff - peak1culdis['fist_gme_dis']).apply(lambda delta: delta.days)


In [None]:
# Rows of peak1culdis relative to the hard-coded 30013 (presumably the total number of authors - TODO confirm)
peak1culdis.shape[0] / 30013

In [None]:
# OLS: dis_length explained by content_length
_ols_model = sm.ols(formula='dis_length ~ content_length', data=peak1culdis)
result9 = _ols_model.fit()
print(result9.summary())

In [None]:
# OLS: No_gme_dis explained by content_length
_ols_model = sm.ols(formula='No_gme_dis ~ content_length', data=peak1culdis)
result9 = _ols_model.fit()
print(result9.summary())

In [None]:
# Critical discourse.
# Posts of peak1auposts whose body or title mentions "moon" (case-insensitive)
_moon_in_body = peak1auposts['body'].str.contains('moon|Moon|MOON', case=False)
_moon_in_title = peak1auposts['title'].str.contains('moon|Moon|MOON', case=False)
moonpost = peak1auposts[_moon_in_body | _moon_in_title]


In [None]:
# Comments of peak1aucomments with a non-missing Body that mentions "moon" (case-insensitive)
_has_body = ~peak1aucomments['Body'].isna()
_moon_in_comment = peak1aucomments['Body'].str.contains('moon|Moon|MOON', case=False)
mooncom = peak1aucomments[_has_body & _moon_in_comment]

In [None]:
# Keep (author, created, sub_id) of each matching post and attach the author's summary columns
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']
moonpost = pd.merge(
    moonpost[['author', 'created', 'sub_id']],
    sumpeak1au[_author_cols],
    left_on='author',
    right_on='author',
    how='inner',
)


In [None]:
# Keep (Author, Publish Date, Comment ID) of each matching comment, attach the author's
# summary columns, then drop the duplicate lower-case 'author' key column
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']
mooncom = pd.merge(
    mooncom[['Author', 'Publish Date', 'Comment ID']],
    sumpeak1au[_author_cols],
    left_on='Author',
    right_on='author',
    how='inner',
).drop(columns='author')

In [None]:
# Per-author number of "moon" comments and "moon" posts
mooncomfreq = mooncom.groupby(['Author'])['Comment ID'].count().to_frame('No_com_moon')
moonpofreq = moonpost.groupby(['author'])['sub_id'].count().to_frame('No_po_moon')

In [None]:
# (created, id) rows for every "moon" post and every "moon" comment.
# rename() returns a new frame here; renaming a column subset in place can raise
# SettingWithCopyWarning.
moonplotpo = moonpost[['created', 'sub_id']].rename(columns={'sub_id': 'id'})
moonplotcom = mooncom[['Publish Date', 'Comment ID']].rename(
    columns={'Publish Date': 'created', 'Comment ID': 'id'})

In [None]:
# Stack the post rows on top of the comment rows
moonplot = pd.concat([moonplotpo, moonplotcom], axis=0)

In [None]:
# How many times has "moon" been used (number of matching posts + comments)?
moonplot.shape[0]

In [None]:
# Monthly number of posts/comments mentioning "moon"
tmoonplot = moonplot.set_index('created')
_moon_by_month = tmoonplot.groupby(pd.Grouper(freq='M'))['id'].count()
_moon_by_month.plot(label="number of pieces used 'moon' by month")
plt.legend()
plt.show()

In [None]:
# Earliest "moon" post per author; its timestamp becomes first_po_moon
moonpost = (
    moonpost
    .sort_values("created")
    .drop_duplicates("author", keep='first')
    .rename(columns={'created': 'first_po_moon'})
)
# Earliest "moon" comment per author; its timestamp becomes first_com_moon
mooncom = (
    mooncom
    .sort_values("Publish Date")
    .drop_duplicates("Author", keep='first')
    .rename(columns={'Publish Date': 'first_com_moon', 'Author': 'author'})
)

In [None]:
# One row per author that has a first "moon" post and/or a first "moon" comment
moonpiece = moonpost.merge(mooncom, on='author', how='outer')

In [None]:
# Keep only the two first-use timestamps and re-attach the author's summary columns
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']
moonpiece = moonpiece[['author', 'first_po_moon', 'first_com_moon']].merge(
    sumpeak1au[_author_cols],
    left_on='author',
    right_on='author',
    how='inner',
)

In [None]:
# Rows of moonpiece relative to the hard-coded 30013 (presumably the total number of authors - TODO confirm)
moonpiece.shape[0] / 30013

In [None]:
# Attach the per-author "moon" comment count (mooncomfreq is indexed by Author)
moonpiece = pd.merge(moonpiece, mooncomfreq, left_on='author', right_on='Author', how='left')
len(moonpiece)

In [None]:
# Attach the per-author "moon" post count (moonpofreq is indexed by author)
moonpiece = pd.merge(moonpiece, moonpofreq, left_on='author', right_on='author', how='left')

In [None]:
# Missing counts (author had no matching comment / post after the left joins) become 0.
# fillna replaces the element-wise loops, which relied on chained assignment and on
# Series.iteritems(), removed in pandas 2.0.
moonpiece['No_com_moon'] = moonpiece['No_com_moon'].fillna(0)
moonpiece['No_po_moon'] = moonpiece['No_po_moon'].fillna(0)
# Total number of "moon" posts and comments per author
moonpiece['No_moon'] = moonpiece['No_po_moon'] + moonpiece['No_com_moon']

In [None]:
# first_moon = the earlier of first_po_moon and first_com_moon; when one of the two is
# missing (NaT) the other one is used.
# Computed column-wise instead of with an iterrows loop and chained assignment into a
# column initialised with '' (object dtype), so the result keeps the dtype of the inputs.
# Assumes both inputs are datetime columns, as the original NaT checks imply.
_first_po = moonpiece['first_po_moon']
_first_com = moonpiece['first_com_moon']
# Take the post time when there is no comment time or the post is strictly earlier;
# comparisons involving NaT are False, so a missing post time falls through to the comment time.
_use_post = _first_com.isna() | (_first_po < _first_com)
moonpiece['first_moon'] = _first_po.where(_use_post, _first_com)

In [None]:
# Weekly number of authors by the time of their first "moon" use
tfm = moonpiece.set_index('first_moon')
_authors_by_week = tfm.groupby(pd.Grouper(freq='W'))['author'].count()
_authors_by_week.plot(label='number of authors by week')
plt.legend()
plt.show()

In [None]:
# firstmoon_length = whole days from first_moon to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
moonpiece['firstmoon_length'] = (_cutoff - moonpiece['first_moon']).apply(lambda delta: delta.days)


In [None]:
# OLS: firstmoon_length explained by truecontent_length
_ols_model = sm.ols(formula='firstmoon_length ~ truecontent_length', data=moonpiece)
result10 = _ols_model.fit()
print(result10.summary())

In [None]:
# OLS: No_moon explained by truecontent_length
_ols_model = sm.ols(formula='No_moon ~ truecontent_length', data=moonpiece)
result10 = _ols_model.fit()
print(result10.summary())

In [None]:
# OLS: gmepost_score explained by firstmoon_length
_ols_model = sm.ols(formula='gmepost_score ~ firstmoon_length', data=moonpiece)
result10 = _ols_model.fit()
print(result10.summary())

In [None]:
# Used "moon" before their first GME item (first_gme later than first_moon), as a share
# of the hard-coded 8135 (presumably len(moonpiece) - TODO confirm)
_moon_before_gme = moonpiece[moonpiece['first_gme'] > moonpiece['first_moon']]
len(_moon_before_gme) / 8135

In [None]:
# Used "moon" at or after their first GME item (first_gme <= first_moon)
_moon_at_or_after_gme = moonpiece['first_gme'] <= moonpiece['first_moon']
len(moonpiece[_moon_at_or_after_gme])

In [None]:
# NOTE(review): both numbers are typed in by hand; 6151 is presumably the count from the
# previous cell and 8135 presumably len(moonpiece) - TODO confirm and compute from the data.
6151/8135

In [None]:
# Authors whose first "moon" use is at or after their first GME item, counted per week
# of first_in_wsb
_moon_at_or_after_gme = moonpiece['first_gme'] <= moonpiece['first_moon']
moonaftergme = moonpiece[_moon_at_or_after_gme]
tmoonaftergme = moonaftergme.set_index('first_in_wsb')
_moon_after_by_week = tmoonaftergme.groupby(pd.Grouper(freq='W'))['author'].count()
_moon_after_by_week.plot(label="number of authors used 'moon' after GME by week")
plt.legend()
plt.show()

In [None]:
# The same weekly author counts, shown as a table
_weekly_groups = tmoonaftergme.groupby(pd.Grouper(freq='W'))
_weekly_groups['author'].count()

In [None]:
# NOTE(review): 3136 is typed in by hand (presumably read off the weekly table above - TODO confirm)
3136 / moonaftergme.shape[0]

In [None]:
# OLS: firstmoon_length explained by truecontent_length (moonaftergme only)
_ols_model = sm.ols(formula='firstmoon_length ~ truecontent_length', data=moonaftergme)
result10 = _ols_model.fit()
print(result10.summary())

In [None]:
# How many people used "moon" in GME content.
# With case=False the single pattern 'moon' already matches every capitalisation.
# NOTE(review): str.contains yields NaN for rows with a missing body/title before the
# OR below - confirm that posts with an empty body but "moon" in the title are handled
# as intended.
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']

moongmepost = peak1augmepo[peak1augmepo.body.str.contains('moon', case=False)
                           | peak1augmepo.title.str.contains('moon', case=False)]
moongmecom = peak1augmecom[(~peak1augmecom.Body_x.isna())
                           & (peak1augmecom.Body_x.str.contains('moon', case=False))]

moongmepost = moongmepost[['author', 'created', 'sub_id']].merge(
    sumpeak1au[_author_cols], left_on='author', right_on='author', how='inner')
moongmecom = moongmecom[['Author', 'Publish Date', 'Comment ID']].merge(
    sumpeak1au[_author_cols], left_on='Author', right_on='author', how='inner')
del moongmecom['author']

# Per-author number of matching comments / posts
moongmecomfreq = moongmecom.groupby(['Author'])['Comment ID'].count().to_frame('No_gmecom_moon')
moongmepofreq = moongmepost.groupby(['author'])['sub_id'].count().to_frame('No_gmepo_moon')

# One row per matching post or comment with a common (created, id) schema.
# rename() returns a new frame here; renaming a column subset in place can raise
# SettingWithCopyWarning.
moongmeplotpo = moongmepost[['created', 'sub_id']].rename(columns={'sub_id': 'id'})
moongmeplotcom = moongmecom[['Publish Date', 'Comment ID']].rename(
    columns={'Publish Date': 'created', 'Comment ID': 'id'})
moongmeplot = pd.concat([moongmeplotpo, moongmeplotcom])
print(len(moongmeplot))

In [None]:
# Monthly number of GME posts/comments mentioning "moon"
tmoongmeplot = moongmeplot.set_index('created')
_moongme_by_month = tmoongmeplot.groupby(pd.Grouper(freq='M'))['id'].count()
_moongme_by_month.plot(label="number of pieces used 'moon' by month")
plt.legend()
plt.show()

In [None]:
# Earliest "moon" GME post per author; its timestamp becomes first_gmepo_moon
moongmepost = (
    moongmepost
    .sort_values("created")
    .drop_duplicates("author", keep='first')
    .rename(columns={'created': 'first_gmepo_moon'})
)
# Earliest "moon" GME comment per author; its timestamp becomes first_gmecom_moon
moongmecom = (
    moongmecom
    .sort_values("Publish Date")
    .drop_duplicates("Author", keep='first')
    .rename(columns={'Publish Date': 'first_gmecom_moon', 'Author': 'author'})
)
# One row per author with either timestamp, re-joined with the author's summary columns
# and with the per-author comment / post counts
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']
moongmepiece = moongmepost.merge(moongmecom, left_on='author', right_on='author', how='outer')
moongmepiece = moongmepiece[['author', 'first_gmepo_moon', 'first_gmecom_moon']].merge(
    sumpeak1au[_author_cols], left_on='author', right_on='author', how='inner')
moongmepiece = pd.merge(moongmepiece, moongmecomfreq, left_on='author', right_on='Author', how='left')
moongmepiece = pd.merge(moongmepiece, moongmepofreq, left_on='author', right_on='author', how='left')

In [None]:
# Missing counts (author had no matching comment / post after the left joins) become 0.
# fillna replaces the element-wise loops, which relied on chained assignment and on
# Series.iteritems(), removed in pandas 2.0.
moongmepiece['No_gmecom_moon'] = moongmepiece['No_gmecom_moon'].fillna(0)
moongmepiece['No_gmepo_moon'] = moongmepiece['No_gmepo_moon'].fillna(0)
# Total number of "moon" GME posts and comments per author
moongmepiece['No_gmemoon'] = moongmepiece['No_gmepo_moon'] + moongmepiece['No_gmecom_moon']

# first_gmemoon = the earlier of first_gmepo_moon and first_gmecom_moon; when one of the
# two is missing (NaT) the other one is used.
# Computed column-wise instead of with an iterrows loop and chained assignment into a
# column initialised with '' (object dtype), so the result keeps the dtype of the inputs.
# Assumes both inputs are datetime columns, as the original NaT checks imply.
_first_po = moongmepiece['first_gmepo_moon']
_first_com = moongmepiece['first_gmecom_moon']
# Take the post time when there is no comment time or the post is strictly earlier;
# comparisons involving NaT are False, so a missing post time falls through to the comment time.
_use_post = _first_com.isna() | (_first_po < _first_com)
moongmepiece['first_gmemoon'] = _first_po.where(_use_post, _first_com)

In [None]:
# Number of authors in moongmepiece, then the share of the hard-coded 30013
# (presumably the total number of authors - TODO confirm)
_n_moongme_authors = len(moongmepiece)
print(_n_moongme_authors)
_n_moongme_authors / 30013

In [None]:
# Weekly number of authors by the time of their first "moon" use in GME content
tfgm = moongmepiece.set_index('first_gmemoon')
_authors_by_week = tfgm.groupby(pd.Grouper(freq='W'))['author'].count()
_authors_by_week.plot(label='number of authors by week')
plt.legend()
plt.show()

In [None]:
# firstgmemoon_length = whole days from first_gmemoon to 2021-06-01 00:00:00
_cutoff = datetime(year=2021, month=6, day=1, hour=0, minute=0, second=0)
moongmepiece['firstgmemoon_length'] = (_cutoff - moongmepiece['first_gmemoon']).apply(lambda delta: delta.days)


In [None]:
# OLS: firstgmemoon_length explained by truecontent_length
_ols_model = sm.ols(formula='firstgmemoon_length ~ truecontent_length', data=moongmepiece)
result11 = _ols_model.fit()
print(result11.summary())

In [None]:
# OLS: No_gmemoon explained by truecontent_length
_ols_model = sm.ols(formula='No_gmemoon ~ truecontent_length', data=moongmepiece)
result11 = _ols_model.fit()
print(result11.summary())

In [None]:
# OLS: gmepost_score explained by firstgmemoon_length
_ols_model = sm.ols(formula='gmepost_score ~ firstgmemoon_length', data=moongmepiece)
result11 = _ols_model.fit()
print(result11.summary())

In [None]:
# Used "moon" in GME content before first_gme (first_gme later than first_gmemoon), as a
# share of the hard-coded 5449 (presumably len(moongmepiece) - TODO confirm)
_gmemoon_before_gme = moongmepiece[moongmepiece['first_gme'] > moongmepiece['first_gmemoon']]
len(_gmemoon_before_gme) / 5449

In [None]:
# gme_gmemoon_length = whole days from first_gme to first_gmemoon
_gme_to_moon = moongmepiece['first_gmemoon'] - moongmepiece['first_gme']
moongmepiece['gme_gmemoon_length'] = _gme_to_moon.apply(lambda delta: delta.days)

In [None]:
# Mean, median, standard deviation, minimum and maximum of gme_gmemoon_length,
# then its 75th percentile as the cell output
for _stat in ('mean', 'median', 'std', 'min', 'max'):
    print(getattr(moongmepiece['gme_gmemoon_length'], _stat)())
np.percentile(moongmepiece['gme_gmemoon_length'], q=75)

In [None]:
# Line plot of how many authors share each gme_gmemoon_length value
_gap_counts = moongmepiece['gme_gmemoon_length'].value_counts()
_gap_counts.plot()

In [None]:
# Weekly number of moongmepiece authors by first_in_wsb
tfgma = moongmepiece.set_index('first_in_wsb')
_authors_by_week = tfgma.groupby(pd.Grouper(freq='W'))['author'].count()
_authors_by_week.plot(label='number of authors by week')
plt.legend()
plt.show()

In [None]:
# OLS: gme_gmemoon_length explained by truecontent_length
_ols_model = sm.ols(formula='gme_gmemoon_length ~ truecontent_length', data=moongmepiece)
result11 = _ols_model.fit()
print(result11.summary())

In [None]:
# How many people used "hold".
# With case=False the single pattern 'hold' already matches every capitalisation, and
# every "holding" variant contains it.
# NOTE(review): str.contains yields NaN for rows with a missing body/title before the
# OR below - confirm that posts with an empty body but "hold" in the title are handled
# as intended.
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']

holdpost = peak1auposts[peak1auposts.body.str.contains('hold', case=False)
                        | peak1auposts.title.str.contains('hold', case=False)]
holdcom = peak1aucomments[(~peak1aucomments.Body.isna())
                          & (peak1aucomments.Body.str.contains('hold', case=False))]

holdpost = holdpost[['author', 'created', 'sub_id']].merge(
    sumpeak1au[_author_cols], left_on='author', right_on='author', how='inner')
holdcom = holdcom[['Author', 'Publish Date', 'Comment ID']].merge(
    sumpeak1au[_author_cols], left_on='Author', right_on='author', how='inner')
del holdcom['author']

# Per-author number of matching comments / posts
holdcomfreq = holdcom.groupby(['Author'])['Comment ID'].count().to_frame('No_com_hold')
holdpofreq = holdpost.groupby(['author'])['sub_id'].count().to_frame('No_po_hold')

# One row per matching post or comment with a common (created, id) schema.
# rename() returns a new frame here; renaming a column subset in place can raise
# SettingWithCopyWarning.
holdplotpo = holdpost[['created', 'sub_id']].rename(columns={'sub_id': 'id'})
holdplotcom = holdcom[['Publish Date', 'Comment ID']].rename(
    columns={'Publish Date': 'created', 'Comment ID': 'id'})
holdplot = pd.concat([holdplotpo, holdplotcom])
print(len(holdplot))

In [None]:
# Monthly number of posts/comments mentioning "hold"
tholdplot = holdplot.set_index('created')
_hold_by_month = tholdplot.groupby(pd.Grouper(freq='M'))['id'].count()
_hold_by_month.plot(label="number of pieces used 'hold' by month")
plt.legend()
plt.show()

In [None]:
# Earliest "hold" post per author; its timestamp becomes first_po_hold
holdpost = (
    holdpost
    .sort_values("created")
    .drop_duplicates("author", keep='first')
    .rename(columns={'created': 'first_po_hold'})
)
# Earliest "hold" comment per author; its timestamp becomes first_com_hold
holdcom = (
    holdcom
    .sort_values("Publish Date")
    .drop_duplicates("Author", keep='first')
    .rename(columns={'Publish Date': 'first_com_hold', 'Author': 'author'})
)
# One row per author with either timestamp, re-joined with the author's summary columns
# and with the per-author comment / post counts
_author_cols = ['author', 'first_in_wsb', 'first_gme', 'last_gme', 'gmepost_score', 'gmecom_score',
                'truecontent_length', 'gme_length', 'firstgme_length', 'No_gme']
holdpiece = holdpost.merge(holdcom, left_on='author', right_on='author', how='outer')
holdpiece = holdpiece[['author', 'first_po_hold', 'first_com_hold']].merge(
    sumpeak1au[_author_cols], left_on='author', right_on='author', how='inner')
holdpiece = pd.merge(holdpiece, holdcomfreq, left_on='author', right_on='Author', how='left')
holdpiece = pd.merge(holdpiece, holdpofreq, left_on='author', right_on='author', how='left')

In [None]:
# Authors absent from a frequency table carry NaN after the left joins; treat that as zero pieces.
# fillna replaces the per-row loops: Series.iteritems was removed in pandas 2.0 and
# chained `df[col][i] = value` assignment does not write back under Copy-on-Write.
holdpiece[['No_com_hold','No_po_hold']] = holdpiece[['No_com_hold','No_po_hold']].fillna(0)
holdpiece['No_hold'] = holdpiece['No_po_hold'] + holdpiece['No_com_hold']
# Earliest of the two first-use timestamps. The row-wise min skips NaT, so an author with
# only a post (or only a comment) gets that one. The column stays datetime64 instead of the
# object dtype produced by initialising it with '' and filling it row by row.
holdpiece['first_hold'] = holdpiece[['first_po_hold','first_com_hold']].apply(pd.to_datetime).min(axis=1)

In [None]:
# Number of rows in holdpiece and that number divided by 30013.
# NOTE(review): 30013 is hard-coded, presumably the total author count; confirm and derive it from the data.
print(holdpiece.shape[0])
holdpiece.shape[0]/30013

In [None]:
# Authors per month of their first 'hold' use.
tfh = holdpiece.set_index('first_hold')
tfh['author'].groupby(pd.Grouper(freq='M')).count().plot(label='number of authors by month')
plt.legend()
plt.show()

In [None]:
# Whole days between each author's first_hold and the reference date 2021-06-01.
# pd.Timestamp is used because the notebook's top cell imports `datetime` as a module,
# so calling `datetime(...)` raises TypeError unless the name was rebound elsewhere.
# pd.to_datetime guards against an object-dtype column; .dt.days replaces the per-row lambda.
holdpiece['firsthold_length'] = (pd.Timestamp('2021-06-01') - pd.to_datetime(holdpiece['first_hold'])).dt.days


In [None]:
# OLS of firsthold_length on truecontent_length over holdpiece.
result10 = sm.ols('firsthold_length ~ truecontent_length', data=holdpiece).fit()
print(result10.summary())

In [None]:
# OLS of No_hold on truecontent_length over holdpiece.
result11 = sm.ols('No_hold ~ truecontent_length', data=holdpiece).fit()
print(result11.summary())

In [None]:
# OLS of gmepost_score on firsthold_length over holdpiece (reuses the name result11).
result11 = sm.ols('gmepost_score ~ firsthold_length', data=holdpiece).fit()
print(result11.summary())

In [None]:
# Share of holdpiece rows whose first_hold is earlier than first_gme.
# NOTE(review): 13055 is hard-coded, presumably len(holdpiece) from an earlier run; confirm.
print(holdpiece.loc[holdpiece['first_gme'] > holdpiece['first_hold']].shape[0]/13055)
# Count and share of rows whose first_hold is at or after first_gme.
print(holdpiece.loc[holdpiece['first_gme'] <= holdpiece['first_hold']].shape[0])
print(holdpiece.loc[holdpiece['first_gme'] <= holdpiece['first_hold']].shape[0]/13055)

In [None]:
# Authors whose first_hold is not earlier than first_gme, counted per week of first_in_wsb.
holdaftergme = holdpiece.loc[holdpiece['first_gme'] <= holdpiece['first_hold']]
tholdaftergme = holdaftergme.set_index('first_in_wsb')
tholdaftergme['author'].groupby(pd.Grouper(freq='W')).count()

In [None]:
# NOTE(review): 5853 is a hard-coded count whose source is not visible here
# (possibly read off the weekly table above); confirm and compute it from the data.
5853/holdaftergme.shape[0]

In [None]:
# Same regression as on holdpiece, restricted to holdaftergme.
result10 = sm.ols('firsthold_length ~ truecontent_length', data=holdaftergme).fit()
print(result10.summary())

In [None]:
# how many people used hold in GME content
# na=False makes a missing body/title count as "no match". Without it str.contains
# returns NaN for missing text and that NaN feeds the boolean `|`.
# NOTE(review): pandas does not treat a NaN left operand of `|` as a plain False, so
# posts matching only in the title could previously be dropped; counts may rise slightly.
holdgmepost = peak1augmepo[
    peak1augmepo.body.str.contains('hold|HOLD|holding|Hold|Holding|HOLDING',case=False,na=False)
    | peak1augmepo.title.str.contains('hold|HOLD|holding|Hold|Holding|HOLDING',case=False,na=False)
]
holdgmecom = peak1augmecom[peak1augmecom.Body_x.str.contains('hold|HOLD|holding|Hold|Holding|HOLDING',case=False,na=False)]
# Keep only authors present in sumpeak1au (inner join) and carry their summary columns along.
holdgmepost = holdgmepost[['author','created','sub_id']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='author', right_on='author', how='inner')
holdgmecom = holdgmecom[['Author','Publish Date','Comment ID']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='Author', right_on='author', how='inner').drop(columns='author')
# Per-author number of matching comments / posts.
holdgmecomfreq = holdgmecom.groupby('Author')['Comment ID'].count().to_frame('No_gmecom_hold')
holdgmepofreq = holdgmepost.groupby('author')['sub_id'].count().to_frame('No_gmepo_hold')
# Common (created, id) layout for plotting; rename returns a new frame instead of
# mutating a column slice in place (which triggers SettingWithCopyWarning).
holdgmeplotpo = holdgmepost[['created','sub_id']].rename(columns={'sub_id':'id'})
holdgmeplotcom = holdgmecom[['Publish Date','Comment ID']].rename(columns={'Publish Date':'created','Comment ID':'id'})
holdgmeplot = pd.concat([holdgmeplotpo,holdgmeplotcom])
print(len(holdgmeplot))

In [None]:
# Monthly number of GME posts/comments containing 'hold'.
tholdgmeplot = holdgmeplot.set_index('created')
tholdgmeplot['id'].groupby(pd.Grouper(freq='M')).count().plot(label="number of pieces used 'hold' by month")
plt.legend()
plt.show()

In [None]:
# get first post hold in GME time
holdgmepost = (holdgmepost.sort_values("created")
               .drop_duplicates("author",keep='first')
               .rename(columns={'created':'first_gmepo_hold'}))
# get first comment hold in GME time
holdgmecom = (holdgmecom.sort_values("Publish Date")
              .drop_duplicates("Author",keep='first')
              .rename(columns={'Publish Date':'first_gmecom_hold','Author':'author'}))
# One row per author who used 'hold' in a GME post and/or comment, with the author summary re-attached.
holdgmepiece = pd.merge(holdgmepost,holdgmecom,left_on='author',right_on='author',how='outer')
holdgmepiece = pd.merge(holdgmepiece[['author','first_gmepo_hold','first_gmecom_hold']],
                        sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
                        left_on='author',right_on='author',how='inner')
holdgmepiece = holdgmepiece.merge(holdgmecomfreq,left_on='author',right_on='Author',how='left')
holdgmepiece = holdgmepiece.merge(holdgmepofreq,left_on='author',right_on='author',how='left')

# Authors absent from a frequency table carry NaN after the left joins; treat that as zero pieces.
# fillna replaces the per-row loops: Series.iteritems was removed in pandas 2.0 and
# chained `df[col][i] = value` assignment does not write back under Copy-on-Write.
holdgmepiece[['No_gmecom_hold','No_gmepo_hold']] = holdgmepiece[['No_gmecom_hold','No_gmepo_hold']].fillna(0)
holdgmepiece['No_gmehold'] = holdgmepiece['No_gmepo_hold'] + holdgmepiece['No_gmecom_hold']
# Earliest of the two first-use timestamps; the row-wise min skips NaT and the column
# stays datetime64 rather than the object dtype produced by filling a '' column row by row.
holdgmepiece['first_gmehold'] = holdgmepiece[['first_gmepo_hold','first_gmecom_hold']].apply(pd.to_datetime).min(axis=1)

In [None]:
# Number of rows in holdgmepiece and that number divided by 30013.
# NOTE(review): 30013 is hard-coded, presumably the total author count; confirm.
print(holdgmepiece.shape[0])
holdgmepiece.shape[0]/30013

In [None]:
# Authors per week of their first 'hold' use in GME content.
tfgh = holdgmepiece.set_index('first_gmehold')
tfgh['author'].groupby(pd.Grouper(freq='W')).count().plot(label='number of authors by week')
plt.legend()
plt.show()

In [None]:
# Whole days between each author's first_gmehold and the reference date 2021-06-01.
# pd.Timestamp is used because the notebook's top cell imports `datetime` as a module,
# so calling `datetime(...)` raises TypeError unless the name was rebound elsewhere.
# pd.to_datetime guards against an object-dtype column; .dt.days replaces the per-row lambda.
holdgmepiece['firstgmehold_length'] = (pd.Timestamp('2021-06-01') - pd.to_datetime(holdgmepiece['first_gmehold'])).dt.days


In [None]:
# OLS of firstgmehold_length on truecontent_length over holdgmepiece.
result11 = sm.ols('firstgmehold_length ~ truecontent_length', data=holdgmepiece).fit()
print(result11.summary())

In [None]:
# OLS of No_gmehold on truecontent_length over holdgmepiece.
result11 = sm.ols('No_gmehold ~ truecontent_length', data=holdgmepiece).fit()
print(result11.summary())

In [None]:
# OLS of gmepost_score on firstgmehold_length over holdgmepiece.
result11 = sm.ols('gmepost_score ~ firstgmehold_length', data=holdgmepiece).fit()
print(result11.summary())

In [None]:
# Share of holdgmepiece rows whose first_gmehold is earlier than first_gme.
# NOTE(review): 9969 is hard-coded, presumably len(holdgmepiece) from an earlier run; confirm.
len(holdgmepiece.loc[holdgmepiece['first_gme'] > holdgmepiece['first_gmehold']])/9969

In [None]:
# Whole days from first_gme to first_gmehold for each author.
holdgmepiece['gme_gmehold_length'] = holdgmepiece['first_gmehold'].sub(holdgmepiece['first_gme']).map(lambda delta: delta.days)

In [None]:
# Mean, median, std, min and max of gme_gmehold_length (one per line), then the 75th percentile.
print(
    holdgmepiece['gme_gmehold_length'].mean(),
    holdgmepiece['gme_gmehold_length'].median(),
    holdgmepiece['gme_gmehold_length'].std(),
    holdgmepiece['gme_gmehold_length'].min(),
    holdgmepiece['gme_gmehold_length'].max(),
    sep='\n',
)
np.percentile(holdgmepiece['gme_gmehold_length'], q=75)

In [None]:
# OLS of gme_gmehold_length on truecontent_length over holdgmepiece.
result11 = sm.ols('gme_gmehold_length ~ truecontent_length', data=holdgmepiece).fit()
print(result11.summary())

In [None]:
# how many people used like the stock
# na=False makes a missing body/title count as "no match". Without it str.contains
# returns NaN for missing text and that NaN feeds the boolean `|`.
# NOTE(review): pandas does not treat a NaN left operand of `|` as a plain False, so
# posts matching only in the title could previously be dropped; counts may rise slightly.
likepost = peak1auposts[
    peak1auposts.body.str.contains('like the stock|Like The Stock|LIKE THE STOCK',case=False,na=False)
    | peak1auposts.title.str.contains('like the stock|Like The Stock|LIKE THE STOCK',case=False,na=False)
]
likecom = peak1aucomments[peak1aucomments.Body.str.contains('like the stock|Like The Stock|LIKE THE STOCK',case=False,na=False)]
# Keep only authors present in sumpeak1au (inner join) and carry their summary columns along.
likepost = likepost[['author','created','sub_id']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='author', right_on='author', how='inner')
likecom = likecom[['Author','Publish Date','Comment ID']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='Author', right_on='author', how='inner').drop(columns='author')
# Per-author number of matching comments / posts.
likecomfreq = likecom.groupby('Author')['Comment ID'].count().to_frame('No_com_like')
likepofreq = likepost.groupby('author')['sub_id'].count().to_frame('No_po_like')
# Common (created, id) layout for plotting; rename returns a new frame instead of
# mutating a column slice in place (which triggers SettingWithCopyWarning).
likeplotpo = likepost[['created','sub_id']].rename(columns={'sub_id':'id'})
likeplotcom = likecom[['Publish Date','Comment ID']].rename(columns={'Publish Date':'created','Comment ID':'id'})
likeplot = pd.concat([likeplotpo,likeplotcom])
print(len(likeplot))

In [None]:
# Monthly number of posts/comments containing 'like the stock'.
tlikeplot = likeplot.set_index('created')
tlikeplot['id'].groupby(pd.Grouper(freq='M')).count().plot(label="number of pieces used 'like the stock' by month")
plt.legend()
plt.show()

In [None]:
# get first post like the stock time
likepost = (likepost.sort_values("created")
            .drop_duplicates("author",keep='first')
            .rename(columns={'created':'first_po_like'}))
# get first comment like the stock time
likecom = (likecom.sort_values("Publish Date")
           .drop_duplicates("Author",keep='first')
           .rename(columns={'Publish Date':'first_com_like','Author':'author'}))
# One row per author who used 'like the stock' in a post and/or comment, with the author summary re-attached.
likepiece = pd.merge(likepost,likecom,left_on='author',right_on='author',how='outer')
likepiece = pd.merge(likepiece[['author','first_po_like','first_com_like']],
                     sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
                     left_on='author',right_on='author',how='inner')
likepiece = likepiece.merge(likecomfreq,left_on='author',right_on='Author',how='left')
likepiece = likepiece.merge(likepofreq,left_on='author',right_on='author',how='left')

# Authors absent from a frequency table carry NaN after the left joins; treat that as zero pieces.
# fillna replaces the per-row loops: Series.iteritems was removed in pandas 2.0 and
# chained `df[col][i] = value` assignment does not write back under Copy-on-Write.
likepiece[['No_com_like','No_po_like']] = likepiece[['No_com_like','No_po_like']].fillna(0)
likepiece['No_like'] = likepiece['No_po_like'] + likepiece['No_com_like']
# Earliest of the two first-use timestamps; the row-wise min skips NaT and the column
# stays datetime64 rather than the object dtype produced by filling a '' column row by row.
likepiece['first_like'] = likepiece[['first_po_like','first_com_like']].apply(pd.to_datetime).min(axis=1)

In [None]:
# Number of rows in likepiece and that number divided by 30013.
# NOTE(review): 30013 is hard-coded, presumably the total author count; confirm.
print(likepiece.shape[0])
likepiece.shape[0]/30013

In [None]:
# Authors per month of their first 'like the stock' use.
tfl = likepiece.set_index('first_like')
tfl['author'].groupby(pd.Grouper(freq='M')).count().plot(label='number of authors by month')
plt.legend()
plt.show()

In [None]:
# Whole days between each author's first_like and the reference date 2021-06-01.
# pd.Timestamp is used because the notebook's top cell imports `datetime` as a module,
# so calling `datetime(...)` raises TypeError unless the name was rebound elsewhere.
# pd.to_datetime guards against an object-dtype column; .dt.days replaces the per-row lambda.
likepiece['firstlike_length'] = (pd.Timestamp('2021-06-01') - pd.to_datetime(likepiece['first_like'])).dt.days


In [None]:
# OLS of firstlike_length on truecontent_length over likepiece.
result10 = sm.ols('firstlike_length ~ truecontent_length', data=likepiece).fit()
print(result10.summary())

In [None]:
# OLS of No_like on truecontent_length over likepiece.
result11 = sm.ols('No_like ~ truecontent_length', data=likepiece).fit()
print(result11.summary())

In [None]:
# OLS of gmepost_score on firstlike_length over likepiece.
result11 = sm.ols('gmepost_score ~ firstlike_length', data=likepiece).fit()
print(result11.summary())

In [None]:
# Share of likepiece rows whose first_like is earlier than first_gme.
# NOTE(review): 2869 is hard-coded, presumably len(likepiece) from an earlier run; confirm.
print(likepiece.loc[likepiece['first_gme'] > likepiece['first_like']].shape[0]/2869)
# Count and share of rows whose first_like is at or after first_gme.
print(likepiece.loc[likepiece['first_gme'] <= likepiece['first_like']].shape[0])
print(likepiece.loc[likepiece['first_gme'] <= likepiece['first_like']].shape[0]/2869)

In [None]:
# Authors whose first_like is not earlier than first_gme, counted per week of first_in_wsb.
likeaftergme = likepiece.loc[likepiece['first_gme'] <= likepiece['first_like']]
tlikeaftergme = likeaftergme.set_index('first_in_wsb')
tlikeaftergme['author'].groupby(pd.Grouper(freq='W')).count()

In [None]:
# NOTE(review): 1078 is a hard-coded count whose source is not visible here
# (possibly read off the weekly table above); confirm and compute it from the data.
1078/likeaftergme.shape[0]

In [None]:
# Same regression as on likepiece, restricted to likeaftergme.
result10 = sm.ols('firstlike_length ~ truecontent_length', data=likeaftergme).fit()
print(result10.summary())

In [None]:
# how many people used like the stock in GME content
# na=False makes a missing body/title count as "no match". Without it str.contains
# returns NaN for missing text and that NaN feeds the boolean `|`.
# NOTE(review): pandas does not treat a NaN left operand of `|` as a plain False, so
# posts matching only in the title could previously be dropped; counts may rise slightly.
likegmepost = peak1augmepo[
    peak1augmepo.body.str.contains('like the stock|Like The Stock|LIKE THE STOCK',case=False,na=False)
    | peak1augmepo.title.str.contains('like the stock|Like The Stock|LIKE THE STOCK',case=False,na=False)
]
likegmecom = peak1augmecom[peak1augmecom.Body_x.str.contains('like the stock|Like The Stock|LIKE THE STOCK',case=False,na=False)]
# Keep only authors present in sumpeak1au (inner join) and carry their summary columns along.
likegmepost = likegmepost[['author','created','sub_id']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='author', right_on='author', how='inner')
likegmecom = likegmecom[['Author','Publish Date','Comment ID']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='Author', right_on='author', how='inner').drop(columns='author')
# Per-author number of matching comments / posts.
likegmecomfreq = likegmecom.groupby('Author')['Comment ID'].count().to_frame('No_gmecom_like')
likegmepofreq = likegmepost.groupby('author')['sub_id'].count().to_frame('No_gmepo_like')
# Common (created, id) layout for plotting; rename returns a new frame instead of
# mutating a column slice in place (which triggers SettingWithCopyWarning).
likegmeplotpo = likegmepost[['created','sub_id']].rename(columns={'sub_id':'id'})
likegmeplotcom = likegmecom[['Publish Date','Comment ID']].rename(columns={'Publish Date':'created','Comment ID':'id'})
likegmeplot = pd.concat([likegmeplotpo,likegmeplotcom])
print(len(likegmeplot))

In [None]:
# Monthly number of GME posts/comments containing 'like the stock'.
tlikegmeplot = likegmeplot.set_index('created')
tlikegmeplot['id'].groupby(pd.Grouper(freq='M')).count().plot(label="number of pieces used 'like the stock' by month")
plt.legend()
plt.show()

In [None]:
# get first post like the stock in GME time
likegmepost = (likegmepost.sort_values("created")
               .drop_duplicates("author",keep='first')
               .rename(columns={'created':'first_gmepo_like'}))
# get first comment like the stock in GME time
likegmecom = (likegmecom.sort_values("Publish Date")
              .drop_duplicates("Author",keep='first')
              .rename(columns={'Publish Date':'first_gmecom_like','Author':'author'}))
# One row per author who used 'like the stock' in a GME post and/or comment, with the author summary re-attached.
likegmepiece = pd.merge(likegmepost,likegmecom,left_on='author',right_on='author',how='outer')
likegmepiece = pd.merge(likegmepiece[['author','first_gmepo_like','first_gmecom_like']],
                        sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
                        left_on='author',right_on='author',how='inner')
likegmepiece = likegmepiece.merge(likegmecomfreq,left_on='author',right_on='Author',how='left')
likegmepiece = likegmepiece.merge(likegmepofreq,left_on='author',right_on='author',how='left')

# Authors absent from a frequency table carry NaN after the left joins; treat that as zero pieces.
# fillna replaces the per-row loops: Series.iteritems was removed in pandas 2.0 and
# chained `df[col][i] = value` assignment does not write back under Copy-on-Write.
likegmepiece[['No_gmecom_like','No_gmepo_like']] = likegmepiece[['No_gmecom_like','No_gmepo_like']].fillna(0)
likegmepiece['No_gmelike'] = likegmepiece['No_gmepo_like'] + likegmepiece['No_gmecom_like']
# Earliest of the two first-use timestamps; the row-wise min skips NaT and the column
# stays datetime64 rather than the object dtype produced by filling a '' column row by row.
likegmepiece['first_gmelike'] = likegmepiece[['first_gmepo_like','first_gmecom_like']].apply(pd.to_datetime).min(axis=1)

In [None]:
# Number of rows in likegmepiece and that number divided by 30013.
# NOTE(review): 30013 is hard-coded, presumably the total author count; confirm.
print(likegmepiece.shape[0])
likegmepiece.shape[0]/30013

In [None]:
# Authors per week of their first 'like the stock' use in GME content.
tfgl = likegmepiece.set_index('first_gmelike')
tfgl['author'].groupby(pd.Grouper(freq='W')).count().plot(label='number of authors by week')
plt.legend()
plt.show()

In [None]:
# Whole days between each author's first_gmelike and the reference date 2021-06-01.
# pd.Timestamp is used because the notebook's top cell imports `datetime` as a module,
# so calling `datetime(...)` raises TypeError unless the name was rebound elsewhere.
# pd.to_datetime guards against an object-dtype column; .dt.days replaces the per-row lambda.
likegmepiece['firstgmelike_length'] = (pd.Timestamp('2021-06-01') - pd.to_datetime(likegmepiece['first_gmelike'])).dt.days


In [None]:
# OLS of firstgmelike_length on truecontent_length over likegmepiece.
result11 = sm.ols('firstgmelike_length ~ truecontent_length', data=likegmepiece).fit()
print(result11.summary())

In [None]:
# OLS of No_gmelike on truecontent_length over likegmepiece.
result11 = sm.ols('No_gmelike ~ truecontent_length', data=likegmepiece).fit()
print(result11.summary())

In [None]:
# OLS of gmepost_score on firstgmelike_length over likegmepiece.
result11 = sm.ols('gmepost_score ~ firstgmelike_length', data=likegmepiece).fit()
print(result11.summary())

In [None]:
# Share of likegmepiece rows whose first_gmelike is earlier than first_gme.
# NOTE(review): 1981 is hard-coded, presumably len(likegmepiece) from an earlier run; confirm.
len(likegmepiece.loc[likegmepiece['first_gme'] > likegmepiece['first_gmelike']])/1981

In [None]:
# Whole days from first_gme to first_gmelike for each author.
likegmepiece['gme_gmelike_length'] = likegmepiece['first_gmelike'].sub(likegmepiece['first_gme']).map(lambda delta: delta.days)

In [None]:
# Mean, median, std, min and max of gme_gmelike_length (one per line), then the 75th percentile.
print(
    likegmepiece['gme_gmelike_length'].mean(),
    likegmepiece['gme_gmelike_length'].median(),
    likegmepiece['gme_gmelike_length'].std(),
    likegmepiece['gme_gmelike_length'].min(),
    likegmepiece['gme_gmelike_length'].max(),
    sep='\n',
)
np.percentile(likegmepiece['gme_gmelike_length'], q=75)

In [None]:
# OLS of gme_gmelike_length on truecontent_length over likegmepiece.
result11 = sm.ols('gme_gmelike_length ~ truecontent_length', data=likegmepiece).fit()
print(result11.summary())

In [None]:
# how many people used retard
# na=False makes a missing body/title count as "no match". Without it str.contains
# returns NaN for missing text and that NaN feeds the boolean `|`.
# NOTE(review): pandas does not treat a NaN left operand of `|` as a plain False, so
# posts matching only in the title could previously be dropped; counts may rise slightly.
retardpost = peak1auposts[
    peak1auposts.body.str.contains('retard|retards|retarded|Retard|RETARD|RETARDS|Retards|RETARDED',case=False,na=False)
    | peak1auposts.title.str.contains('retard|retards|retarded|Retard|RETARD|RETARDS|Retards|RETARDED',case=False,na=False)
]
retardcom = peak1aucomments[peak1aucomments.Body.str.contains('retard|retards|retarded|Retard|RETARD|RETARDS|Retards|RETARDED',case=False,na=False)]
# Keep only authors present in sumpeak1au (inner join) and carry their summary columns along.
retardpost = retardpost[['author','created','sub_id']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='author', right_on='author', how='inner')
retardcom = retardcom[['Author','Publish Date','Comment ID']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='Author', right_on='author', how='inner').drop(columns='author')
# Per-author number of matching comments / posts.
retardcomfreq = retardcom.groupby('Author')['Comment ID'].count().to_frame('No_com_retard')
retardpofreq = retardpost.groupby('author')['sub_id'].count().to_frame('No_po_retard')
# Common (created, id) layout for plotting; rename returns a new frame instead of
# mutating a column slice in place (which triggers SettingWithCopyWarning).
retardplotpo = retardpost[['created','sub_id']].rename(columns={'sub_id':'id'})
retardplotcom = retardcom[['Publish Date','Comment ID']].rename(columns={'Publish Date':'created','Comment ID':'id'})
retardplot = pd.concat([retardplotpo,retardplotcom])
print(len(retardplot))

In [None]:
# Monthly number of posts/comments containing 'retard'.
tretardplot = retardplot.set_index('created')
# Legend label corrected: the term tracked here is 'retard'; the previous text
# "retard the stock" was a leftover from the 'like the stock' cell.
tretardplot.groupby(pd.Grouper(freq='M'))['id'].count().plot(label="number of pieces used 'retard' by month")
plt.legend()
plt.show()

In [None]:
# get first post retard time
retardpost = (retardpost.sort_values("created")
              .drop_duplicates("author",keep='first')
              .rename(columns={'created':'first_po_retard'}))
# get first comment retard time
retardcom = (retardcom.sort_values("Publish Date")
             .drop_duplicates("Author",keep='first')
             .rename(columns={'Publish Date':'first_com_retard','Author':'author'}))
# One row per author who used 'retard' in a post and/or comment, with the author summary re-attached.
retardpiece = pd.merge(retardpost,retardcom,left_on='author',right_on='author',how='outer')
retardpiece = pd.merge(retardpiece[['author','first_po_retard','first_com_retard']],
                       sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
                       left_on='author',right_on='author',how='inner')
retardpiece = retardpiece.merge(retardcomfreq,left_on='author',right_on='Author',how='left')
retardpiece = retardpiece.merge(retardpofreq,left_on='author',right_on='author',how='left')

# Authors absent from a frequency table carry NaN after the left joins; treat that as zero pieces.
# fillna replaces the per-row loops: Series.iteritems was removed in pandas 2.0 and
# chained `df[col][i] = value` assignment does not write back under Copy-on-Write.
retardpiece[['No_com_retard','No_po_retard']] = retardpiece[['No_com_retard','No_po_retard']].fillna(0)
retardpiece['No_retard'] = retardpiece['No_po_retard'] + retardpiece['No_com_retard']
# Earliest of the two first-use timestamps; the row-wise min skips NaT and the column
# stays datetime64 rather than the object dtype produced by filling a '' column row by row.
retardpiece['first_retard'] = retardpiece[['first_po_retard','first_com_retard']].apply(pd.to_datetime).min(axis=1)

In [None]:
# Number of rows in retardpiece and that number divided by 30013.
# NOTE(review): 30013 is hard-coded, presumably the total author count; confirm.
print(retardpiece.shape[0])
retardpiece.shape[0]/30013

In [None]:
# Authors per month of their first 'retard' use.
tfr = retardpiece.set_index('first_retard')
tfr['author'].groupby(pd.Grouper(freq='M')).count().plot(label='number of authors by month')
plt.legend()
plt.show()

In [None]:
# Whole days between each author's first_retard and the reference date 2021-06-01.
# pd.Timestamp is used because the notebook's top cell imports `datetime` as a module,
# so calling `datetime(...)` raises TypeError unless the name was rebound elsewhere.
# pd.to_datetime guards against an object-dtype column; .dt.days replaces the per-row lambda.
retardpiece['firstretard_length'] = (pd.Timestamp('2021-06-01') - pd.to_datetime(retardpiece['first_retard'])).dt.days


In [None]:
# OLS of firstretard_length on truecontent_length over retardpiece.
result10 = sm.ols('firstretard_length ~ truecontent_length', data=retardpiece).fit()
print(result10.summary())

In [None]:
# OLS of No_retard on truecontent_length over retardpiece.
result11 = sm.ols('No_retard ~ truecontent_length', data=retardpiece).fit()
print(result11.summary())

In [None]:
# OLS of gmepost_score on firstretard_length over retardpiece.
result11 = sm.ols('gmepost_score ~ firstretard_length', data=retardpiece).fit()
print(result11.summary())

In [None]:
# Share of retardpiece rows whose first_retard is earlier than first_gme.
# NOTE(review): 7975 is hard-coded, presumably len(retardpiece) from an earlier run; confirm.
print(retardpiece.loc[retardpiece['first_gme'] > retardpiece['first_retard']].shape[0]/7975)
# Count and share of rows whose first_retard is at or after first_gme.
print(retardpiece.loc[retardpiece['first_gme'] <= retardpiece['first_retard']].shape[0])
print(retardpiece.loc[retardpiece['first_gme'] <= retardpiece['first_retard']].shape[0]/7975)

In [None]:
# Authors whose first_retard is not earlier than first_gme, counted per week of first_in_wsb.
retardaftergme = retardpiece.loc[retardpiece['first_gme'] <= retardpiece['first_retard']]
tretardaftergme = retardaftergme.set_index('first_in_wsb')
tretardaftergme['author'].groupby(pd.Grouper(freq='W')).count()

In [None]:
# NOTE(review): 2663 is a hard-coded count whose source is not visible here
# (possibly read off the weekly table above); confirm and compute it from the data.
2663/retardaftergme.shape[0]

In [None]:
# Same regression as on retardpiece, restricted to retardaftergme.
result10 = sm.ols('firstretard_length ~ truecontent_length', data=retardaftergme).fit()
print(result10.summary())

In [None]:
# how many people used retard in GME content
# na=False makes a missing body/title count as "no match". Without it str.contains
# returns NaN for missing text and that NaN feeds the boolean `|`.
# NOTE(review): pandas does not treat a NaN left operand of `|` as a plain False, so
# posts matching only in the title could previously be dropped; counts may rise slightly.
retardgmepost = peak1augmepo[
    peak1augmepo.body.str.contains('retard|retards|retarded|Retard|RETARD|RETARDS|Retards|RETARDED',case=False,na=False)
    | peak1augmepo.title.str.contains('retard|retards|retarded|Retard|RETARD|RETARDS|Retards|RETARDED',case=False,na=False)
]
retardgmecom = peak1augmecom[peak1augmecom.Body_x.str.contains('retard|retards|retarded|Retard|RETARD|RETARDS|Retards|RETARDED',case=False,na=False)]
# Keep only authors present in sumpeak1au (inner join) and carry their summary columns along.
retardgmepost = retardgmepost[['author','created','sub_id']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='author', right_on='author', how='inner')
retardgmecom = retardgmecom[['Author','Publish Date','Comment ID']].merge(
    sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
    left_on='Author', right_on='author', how='inner').drop(columns='author')
# Per-author number of matching comments / posts.
retardgmecomfreq = retardgmecom.groupby('Author')['Comment ID'].count().to_frame('No_gmecom_retard')
retardgmepofreq = retardgmepost.groupby('author')['sub_id'].count().to_frame('No_gmepo_retard')
# Common (created, id) layout for plotting; rename returns a new frame instead of
# mutating a column slice in place (which triggers SettingWithCopyWarning).
retardgmeplotpo = retardgmepost[['created','sub_id']].rename(columns={'sub_id':'id'})
retardgmeplotcom = retardgmecom[['Publish Date','Comment ID']].rename(columns={'Publish Date':'created','Comment ID':'id'})
retardgmeplot = pd.concat([retardgmeplotpo,retardgmeplotcom])
print(len(retardgmeplot))

In [None]:
# Monthly number of GME posts/comments containing 'retard'.
tretardgmeplot = retardgmeplot.set_index('created')
tretardgmeplot['id'].groupby(pd.Grouper(freq='M')).count().plot(label="number of pieces used 'retard' by month")
plt.legend()
plt.show()

In [None]:
# get first post retard in GME time
retardgmepost = (retardgmepost.sort_values("created")
                 .drop_duplicates("author",keep='first')
                 .rename(columns={'created':'first_gmepo_retard'}))
# get first comment retard in GME time
retardgmecom = (retardgmecom.sort_values("Publish Date")
                .drop_duplicates("Author",keep='first')
                .rename(columns={'Publish Date':'first_gmecom_retard','Author':'author'}))
# One row per author who used 'retard' in a GME post and/or comment, with the author summary re-attached.
retardgmepiece = pd.merge(retardgmepost,retardgmecom,left_on='author',right_on='author',how='outer')
retardgmepiece = pd.merge(retardgmepiece[['author','first_gmepo_retard','first_gmecom_retard']],
                          sumpeak1au[['author','first_in_wsb','first_gme','last_gme','gmepost_score','gmecom_score','truecontent_length','gme_length','firstgme_length','No_gme']],
                          left_on='author',right_on='author',how='inner')
retardgmepiece = retardgmepiece.merge(retardgmecomfreq,left_on='author',right_on='Author',how='left')
retardgmepiece = retardgmepiece.merge(retardgmepofreq,left_on='author',right_on='author',how='left')

# Authors absent from a frequency table carry NaN after the left joins; treat that as zero pieces.
# fillna replaces the per-row loops: Series.iteritems was removed in pandas 2.0 and
# chained `df[col][i] = value` assignment does not write back under Copy-on-Write.
retardgmepiece[['No_gmecom_retard','No_gmepo_retard']] = retardgmepiece[['No_gmecom_retard','No_gmepo_retard']].fillna(0)
retardgmepiece['No_gmeretard'] = retardgmepiece['No_gmepo_retard'] + retardgmepiece['No_gmecom_retard']
# Earliest of the two first-use timestamps; the row-wise min skips NaT and the column
# stays datetime64 rather than the object dtype produced by filling a '' column row by row.
retardgmepiece['first_gmeretard'] = retardgmepiece[['first_gmepo_retard','first_gmecom_retard']].apply(pd.to_datetime).min(axis=1)

In [None]:
# Number of rows in retardgmepiece and that number divided by 30013.
# NOTE(review): 30013 is hard-coded, presumably the total author count; confirm.
print(retardgmepiece.shape[0])
retardgmepiece.shape[0]/30013

In [None]:
# Authors per week of their first 'retard' use in GME content.
tfgr = retardgmepiece.set_index('first_gmeretard')
tfgr['author'].groupby(pd.Grouper(freq='W')).count().plot(label='number of authors by week')
plt.legend()
plt.show()

In [None]:
# Whole days between each author's first_gmeretard and the reference date 2021-06-01.
# pd.Timestamp is used because the notebook's top cell imports `datetime` as a module,
# so calling `datetime(...)` raises TypeError unless the name was rebound elsewhere.
# pd.to_datetime guards against an object-dtype column; .dt.days replaces the per-row lambda.
retardgmepiece['firstgmeretard_length'] = (pd.Timestamp('2021-06-01') - pd.to_datetime(retardgmepiece['first_gmeretard'])).dt.days


In [None]:
# OLS of firstgmeretard_length on truecontent_length over retardgmepiece.
result11 = sm.ols('firstgmeretard_length ~ truecontent_length', data=retardgmepiece).fit()
print(result11.summary())

In [None]:
# OLS of No_gmeretard on truecontent_length over retardgmepiece.
result11 = sm.ols('No_gmeretard ~ truecontent_length', data=retardgmepiece).fit()
print(result11.summary())

In [None]:
# OLS of gmepost_score on firstgmeretard_length over retardgmepiece.
result11 = sm.ols('gmepost_score ~ firstgmeretard_length', data=retardgmepiece).fit()
print(result11.summary())

In [None]:
# Share of retardgmepiece rows whose first_gmeretard is earlier than first_gme.
# NOTE(review): 5383 is hard-coded, presumably len(retardgmepiece) from an earlier run; confirm.
len(retardgmepiece.loc[retardgmepiece['first_gme'] > retardgmepiece['first_gmeretard']])/5383

In [None]:
# Whole days from first_gme to first_gmeretard for each author.
retardgmepiece['gme_gmeretard_length'] = retardgmepiece['first_gmeretard'].sub(retardgmepiece['first_gme']).map(lambda delta: delta.days)

In [None]:
# Mean, median, std, min and max of gme_gmeretard_length (one per line), then the 75th percentile.
print(
    retardgmepiece['gme_gmeretard_length'].mean(),
    retardgmepiece['gme_gmeretard_length'].median(),
    retardgmepiece['gme_gmeretard_length'].std(),
    retardgmepiece['gme_gmeretard_length'].min(),
    retardgmepiece['gme_gmeretard_length'].max(),
    sep='\n',
)
np.percentile(retardgmepiece['gme_gmeretard_length'], q=75)

In [None]:
# OLS of gme_gmeretard_length on truecontent_length over retardgmepiece.
result11 = sm.ols('gme_gmeretard_length ~ truecontent_length', data=retardgmepiece).fit()
print(result11.summary())

In [None]:
#sumpeak1au.loc[sumpeak1au["Author"].isna(),"Author"] = sumpeak1au["author"]

In [None]:
#sumpeak1au.to_csv('/Users/elaine/Desktop/poscore.csv')

In [None]:
# Inspection cell: displays sumpeak1aucomgme as the cell output.
# NOTE(review): the object is defined outside this section; if it is a large frame, prefer .head().
sumpeak1aucomgme

In [None]:
# Earliest 'Publish Date' in peak1ausamcom.
peak1ausamcom.loc[:, 'Publish Date'].min()

In [None]:
# Latest 'Publish Date' in peak1ausamcom.
peak1ausamcom.loc[:, 'Publish Date'].max()

In [None]:
# Latest value of first_posted_in_wsb across sumpeak1au.
# NOTE(review): other cells in this section use a column named 'first_in_wsb'; confirm this column exists.
sumpeak1au.loc[:, "first_posted_in_wsb"].max()

In [None]:
# Mean comment Score per Author, outer-joined onto the author summary.
# NOTE(review): this adds a column named 'Score' while the following cells read 'score'; confirm which is intended.
# NOTE(review): re-running this cell merges again and yields Score_x/Score_y, since sumpeak1au is overwritten.
comscoredf = peak1ausamcom.groupby('Author')['Score'].mean().to_frame()
sumpeak1au = pd.merge(sumpeak1au, comscoredf, left_on='author', right_on='Author', how='outer')

In [None]:
# Mean of the 'score' column of sumpeak1au.
sumpeak1au.loc[:, "score"].mean()

In [None]:
# Median of the 'score' column of sumpeak1au.
sumpeak1au.loc[:, "score"].median()

In [None]:
import statsmodels.formula.api as sm

In [None]:
# OLS of score on posted_length over sumpeak1au.
result1 = sm.ols('score ~ posted_length', data=sumpeak1au).fit()
print(result1.summary())