In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import sklearn
import lzma
import nltk

from pandas.io.json import json_normalize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

<h1>Presentation Data</h1>
<p>Hello there! For those of you bothering to look at this: I'm just generating some data and graphs for my presentation, so there is less markdown than I normally write, though I will state what each output is for.</p>

In [12]:
# Subreddits singled out for the presentation; used later to filter the
# high-scoring comments.
targetsubreddits = [
    "relationships",
    "aww",
    "nfl",
    "PrequelMemes",
    "gaming",
    "mildlyinteresting",
    "politics",
    "Showerthoughts",
    "worldnews",
    "gifs",
    "StarWars",
    "funny",
]

In [2]:
# Load the comment DataFrame from a local pickle (presumably tokenized upstream,
# going by the file name -- TODO confirm). Later cells read its `score`,
# `subreddit`, `permalink` and `body` columns.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
redditframe = pd.read_pickle("../data/TokenizedRedditframe.pkl")

<p>The subreddits with the most negatively scored comments (score below 0).</p>

In [3]:
# Number of comments with a score below zero, tallied per subreddit.
redditframe.loc[redditframe["score"] < 0, "subreddit"].value_counts()

politics               4723
AskReddit              4509
worldnews              3250
news                   2709
nba                    2238
StarWars               1365
videos                 1180
nfl                    1130
movies                 1110
leagueoflegends        1045
PUBATTLEGROUNDS        1015
soccer                 1014
DestinyTheGame          993
europe                  973
pics                    795
gaming                  794
Bitcoin                 774
conspiracy              761
survivor                757
todayilearned           735
SquaredCircle           706
funny                   660
PoliticalHumor          651
DotA2                   629
Showerthoughts          613
canada                  554
CryptoCurrency          553
technology              498
Games                   469
australia               429
                       ... 
see                       1
mtgbracket                1
Basketball                1
wicked_edge               1
DarK                

<p>The subreddits with the most positively scored comments (score above 0).</p>

In [4]:
# Number of comments with a score above zero, tallied per subreddit.
redditframe.loc[redditframe["score"] > 0, "subreddit"].value_counts()

AskReddit               176476
politics                 49591
The_Donald               33080
nba                      30907
news                     26479
worldnews                24598
StarWars                 22675
survivor                 20599
Bitcoin                  19473
nfl                      17656
movies                   15498
DestinyTheGame           15290
gaming                   14506
PUBATTLEGROUNDS          14499
pics                     14410
videos                   14051
funny                    13947
Showerthoughts           13369
leagueoflegends          13241
CryptoCurrency           13061
todayilearned            12078
CFB                      11539
fantasyfootball          11252
teenagers                10987
RocketLeagueExchange     10095
soccer                   10034
SquaredCircle             9863
btc                       9436
DBZDokkanBattle           9004
FIFA                      8824
                         ...  
skizzymars                   1
ZenGMFoo

<p>The subreddits with the most neutral comments (score of exactly 0).</p>

In [5]:
# Number of comments with a score of exactly zero, tallied per subreddit.
redditframe.loc[redditframe["score"] == 0, "subreddit"].value_counts()

AskReddit             5342
politics              2800
worldnews             2251
news                  2176
StarWars              1282
nba                   1214
Bitcoin               1180
leagueoflegends       1091
DestinyTheGame        1091
PUBATTLEGROUNDS        997
movies                 893
videos                 870
Showerthoughts         806
survivor               746
nfl                    735
todayilearned          661
gaming                 640
pics                   633
funny                  609
conspiracy             606
CryptoCurrency         603
btc                    504
technology             498
DotA2                  486
europe                 476
soccer                 466
SquaredCircle          459
The_Donald             454
CringeAnarchy          433
gifs                   418
                      ... 
naturaltitties           1
ZeroWaste                1
deerhunting              1
CatholicMemes            1
EnglishLearning          1
chillmusic               1
S

In [6]:
# Average comment score within each subreddit.
redditframe.groupby("subreddit")["score"].mean()

subreddit
0Wtf_amIdoingHere     1.000000
0ad                   2.000000
0x7B1DEA01            1.000000
0xProject             2.676923
1000more              1.000000
1000words             0.500000
1001Movies            1.000000
100movies365days      2.235294
100sexiest            1.500000
100thieves            1.000000
100yearsago           2.500000
101Wicca              2.500000
1022                  1.666667
10cloverfieldlane     2.666667
10mm                  1.000000
10years               1.000000
117thOSINT            1.000000
1200isjerky          12.666667
1200isplenty          9.996479
125R                  1.000000
12Monkeys             3.200000
12in12                1.571429
12thMan               5.750000
1337Foundation        1.000000
13ReasonsWhy          4.333333
13or30               15.600000
13thage               2.941176
14ers                 5.000000
1500isplenty         17.142857
15SecondStories       1.615385
                       ...    
zec                   1.66129

<p>Can't believe I never did this before, but here is the distribution of comments across subreddits.</p>

In [7]:
# How many comments each subreddit contributes to the frame.
redditframe["subreddit"].value_counts()

AskReddit               186327
politics                 57114
nba                      34359
The_Donald               33815
news                     31364
worldnews                30099
StarWars                 25322
survivor                 22102
Bitcoin                  21427
nfl                      19521
movies                   17501
DestinyTheGame           17374
PUBATTLEGROUNDS          16511
videos                   16101
gaming                   15940
pics                     15838
leagueoflegends          15377
funny                    15216
Showerthoughts           14788
CryptoCurrency           14217
todayilearned            13474
CFB                      12190
fantasyfootball          11753
soccer                   11514
teenagers                11252
SquaredCircle            11028
RocketLeagueExchange     10505
btc                      10340
DBZDokkanBattle           9608
europe                    9526
                         ...  
GeologyFans                  1
PokemonG

In [13]:
# Keep comments scoring at least 50, then narrow those down to the
# subreddits listed in targetsubreddits.
above50 = redditframe.loc[redditframe["score"] >= 50]
above50t = above50.loc[above50["subreddit"].isin(targetsubreddits)]

In [36]:
# Summary statistics (count, mean, std, quartiles, extremes) of the score
# for each targeted subreddit among the comments scoring 50 or more.
above50t.groupby("subreddit")["score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
PrequelMemes,130.0,150.469231,163.691437,50.0,63.25,90.0,149.5,1164.0
Showerthoughts,479.0,400.661795,1040.023066,50.0,73.5,131.0,277.0,13328.0
StarWars,851.0,149.481786,222.027572,50.0,64.0,90.0,156.0,3692.0
aww,362.0,376.994475,1135.255738,50.0,69.0,110.0,247.0,16122.0
funny,580.0,450.589655,1222.346296,50.0,72.0,117.5,270.25,11239.0
gaming,691.0,419.299566,1256.65268,50.0,70.0,119.0,275.5,22738.0
gifs,324.0,757.317901,2844.763898,50.0,72.75,122.0,289.25,28613.0
mildlyinteresting,206.0,492.194175,1340.74211,50.0,71.0,115.5,356.0,14349.0
nfl,1019.0,175.197252,352.575079,50.0,65.0,91.0,164.0,8178.0
politics,1786.0,228.754759,498.984808,50.0,65.0,101.0,192.0,7727.0


In [33]:
# Show full cell contents (permalinks, comment bodies) instead of truncating
# them. None is the supported "no limit" value; the older -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer releases.
# This is a session-wide setting, so it also affects every later cell.
pd.set_option('display.max_colwidth', None)
# Permalink of every targeted comment whose score exceeds 25000.
above50t.loc[above50t["score"] > 25000, "permalink"]

2679664    /r/gifs/comments/7ldc7d/its_all_downhill_from_here/drle9vt/
Name: permalink, dtype: object

In [34]:
# Text of every targeted comment whose score exceeds 25000.
above50t.loc[above50t["score"] > 25000, "body"]

2679664    Well at least he is on the right side to go back up and give it a second try. 
Name: body, dtype: object

In [37]:
# Read only the first line of the raw 2017-12-21 dump into `text`.
# Opening, reading and closing happen in one cell, so the handle is released
# even if the read raises or the notebook is interrupted; previously these
# were three separate cells and a skipped or failed cell left the file open.
# As before, `file` stays bound afterwards, as a closed handle.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm the dump's encoding and pass it explicitly if it differs.
with open("../data/RC_2017-12-21") as file:
    text = file.readline()