## Imports

In [43]:
import os
import pickle
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

## Load data

In [2]:
KAGGLE_DATA_PATH = pathlib.Path.cwd().parent / "data" / "external" / "reddit-selfposts" / "reddit.tsv" 

In [3]:
data = pd.read_csv(KAGGLE_DATA_PATH, sep="\t")

In [4]:
data

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."
...,...,...,...,...
1012995,5r9k4h,MSLGame,Is this months rebirth and dungeon astro's wor...,I looking on what to evo3 farm next and was ex...
1012996,6529fp,CrohnsDisease,I might need a Medical leave from grad school,Has anyone here ever needed a medical leave fr...
1012997,7tiyzx,HongKong,Police harassing ethnic minorities in Hong Kong,I thought I'd make this post so that more peop...
1012998,664ha3,yorku,SU EECS 2030 and EECS 2021 - need advice,"Hi, I just finished 1st year EECS courses and ..."


In [6]:
data['selftext'][42]

'I fell asleep watching the Fantastic Beast movie so I’m guessing it wasn’t great.  I feel like it would be really well adapted into a TV show (not much knowledge on media rights).  Each chapter would be an episode and use different characters to show the discovery or a crazy moment involving the creature.  '

In [5]:
sample = data.sample(frac=0.1, replace=True, random_state=42)

In [6]:
sample.shape

(101300, 4)

In [7]:
train, test = train_test_split(sample, test_size=0.2, stratify=sample["subreddit"])

train.shape, test.shape

((81040, 4), (20260, 4))

In [8]:
X_train = train["selftext"]
X_test = test["selftext"]

y_train = train["subreddit"]
y_test = test["subreddit"]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(81040,) (20260,)
(81040,) (20260,)


In [9]:
X_train.head()

729618    Previous owner had roof replaced/repaired not ...
326284    Bought this for my partner as a valentines' gi...
541065    This is sensational! my friends and I will lau...
922717    hi <lb>first i'm F2P i played this game since ...
287615    I have a BS degree in Civil Engineering from a...
Name: selftext, dtype: object

In [10]:
y_train.head()

729618          RealEstate
326284      whatsthisplant
541065             shrooms
922717    CaptainTsubasaDT
287615    civilengineering
Name: subreddit, dtype: object

In [11]:
# Encode the target using LabelEncoder

le = LabelEncoder()  
le.fit(y_train)  

y_train = le.transform(y_train)
y_test  = le.transform(y_test)

y_train[:8]

array([300, 993, 862,  62, 519, 183,  50, 380])

## Models

To do:
  
- Vectorize the data - TfidfVectorizer
> convert the words into numbers

- topic modelling

In [18]:
# Vectorize data

vect = TfidfVectorizer(
                    max_features=1000,
                    min_df=1,
                    ngram_range=(1, 2),
                    stop_words='english'
)

# Learn vocabulary and idf, return term-document matrix.
tdm = vect.fit_transform(X_train)

print(vect.get_feature_names())

['00', '000', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '20', '2016', '2017', '24', '25', '30', '40', '50', '60', 'ability', 'able', 'absolutely', 'access', 'account', 'action', 'active', 'actual', 'actually', 'add', 'added', 'address', 'advance', 'advice', 'age', 'ago', 'air', 'album', 'allow', 'allowed', 'amazing', 'amazon', 'amp', 'amp nbsp', 'answer', 'answers', 'anxiety', 'anybody', 'anymore', 'anyways', 'app', 'apparently', 'apply', 'appreciate', 'appreciated', 'area', 'aren', 'art', 'ask', 'asked', 'asking', 'assume', 'attack', 'available', 'average', 'avoid', 'aware', 'away', 'awesome', 'baby', 'background', 'bad', 'ball', 'bar', 'base', 'based', 'basic', 'basically', 'bed', 'beginning', 'believe', 'best', 'better', 'big', 'bit', 'black', 'block', 'blood', 'blue', 'board', 'body', 'book', 'books', 'bought', 'box', 'brand', 'break', 'bring', 'brother', 'brought', 'budget', 'build', 'building', 'built', 'bunch', 'business', 'button', 'buy', 'buying', 'called', 

In [19]:
test = pd.DataFrame(tdm.todense(), columns=vect.get_feature_names())

In [20]:
test

Unnamed: 0,00,000,10,100,11,12,13,14,15,16,...,year,year old,years,years ago,yes,yesterday,young,youtube,youtube com,zero
0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.0,0.237150,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.135125,0.0,0.130210,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.0,0.062204,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81035,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.22676,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81036,0.0,0.082243,0.0,0.0,0.148102,0.0,0.0,0.00000,0.0,0.0,...,0.101520,0.0,0.195655,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81037,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.272408,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81038,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# algorithm{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}
nn = NearestNeighbors(n_neighbors=10, algorithm='brute', n_jobs=-1)

# Fit the model on TFidf Vectors
nn.fit(test)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [29]:
# https://www.reddit.com/r/learnprogramming/comments/g99at4/i_printed_hello_world_in_cobol/

test_input = """
I’m not much of a programmer, but when I saw that the world needs COBOL programmers right now, 
I thought I would do my best to help out, even though I knew nothing about the language. I’ve 
spent way too many hours over the past two weeks trying to get my system configured just to 
compile and run COBOL code. It might not seem like a big deal, but seeing those two words on 
the system output makes me feel like I can do anything!
"""

test_sparse = vect.transform([test_input])

In [30]:
test_sparse

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [31]:
test_array = nn.kneighbors(test_sparse.todense(), n_neighbors=10)
test_array

(array([[1.        , 1.17545605, 1.18267843, 1.18894129, 1.18932033,
         1.19187979, 1.19381911, 1.19750593, 1.19750593, 1.1993946 ]]),
 array([[61881, 12118, 48449, 32744, 41322, 54627, 13228, 49087, 42154,
          5260]]))

In [32]:
rec_id_list = test_array[1][0]
rec_id_list

array([61881, 12118, 48449, 32744, 41322, 54627, 13228, 49087, 42154,
        5260])

In [33]:
recommendations = data.iloc[rec_id_list]["subreddit"]

In [34]:
recommendations


61881            learnpython
12118              indonesia
48449         CoDCompetitive
32744           communism101
41322            danganronpa
54627                 cancer
13228                lebanon
49087         SCREENPRINTING
42154    KingkillerChronicle
5260             techsupport
Name: subreddit, dtype: object

In [37]:
post_two = """

Michael Jordan on Isiah Thomas: "Whatever he says now, you know it wasn't his true actions then. 
He's had time to think about it. Or, the reaction of the public, that's kind of changed his 
perspective of it. You can show me anything you want. There's no way you can convince me he wasn't an asshole.
"""

In [40]:
def recommend(req, n=10):
    """Function to recommend top n subreddits given a request."""
    # Create vector from request
    req_vec = vect.transform([req])

    # Get indexes for n nearest neighbors
    top_id = nn.kneighbors(req_vec.todense(), n_neighbors=n)[1][0]

    # Index-locate the neighbors in original dataframe
    top_array = data.iloc[top_id]["subreddit"]

    return top_array

In [41]:
result = recommend(post_two)
result

61881      learnpython
30640             tall
66617          Stellar
78390        optometry
66999    crossdressing
50738          Cuckold
58302        photoshop
63971           German
63749     amateurradio
21173      musictheory
Name: subreddit, dtype: object

In [45]:
def picklizer():
    pass