In [1]:
from sqlalchemy import create_engine, MetaData, inspect
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
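# TODO: import a Naive Bayes estimator, e.g. `from sklearn.naive_bayes import MultinomialNB` (sklearn.naive_bayes has no class named `NaiveBayes`)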

### Load review data from SQLite

In [2]:
def get_project_folder():
    """Return the parent directory of the current working directory."""
    working_dir = os.getcwd()
    # os.path.split yields (head, tail); the head is the containing folder.
    parent_dir, _ = os.path.split(working_dir)
    return parent_dir

In [3]:
# Locate the SQLite file relative to the project root (the parent of the
# directory the notebook is run from).
project_folder = get_project_folder()
# os.path.join inserts the separator that is correct for the host OS; the
# previous hard-coded Windows backslashes produced a wrong path on Linux/macOS.
db_path = os.path.join(project_folder, 'data', 'raw', 'database.sqlite')

In [4]:
# Connecting to SQLite DB
engine = create_engine('sqlite:///' + db_path)
# Open one connection so an unreachable database fails here rather than in a
# later cell, then release it immediately. The previous bare engine.connect()
# call discarded the connection without ever closing it.
with engine.connect():
    pass
# Inspector used for schema lookups (e.g. column names) further down.
ins = inspect(engine)

In [5]:
def get_table_column_names(table_name):
    """Return the column names of ``table_name`` in the order the inspector reports them.

    Relies on the module-level inspector ``ins``; each entry it returns for a
    column is a mapping from which only the ``'name'`` value is kept.
    """
    names = []
    for column_info in ins.get_columns(table_name=table_name):
        names.append(column_info['name'])
    return names

In [6]:
# Read each table straight into a DataFrame. pd.read_sql_query manages the
# connection and cursor itself and takes the column names from the result set,
# replacing the earlier engine.execute(...) calls (Engine.execute was removed
# in SQLAlchemy 2.0, and those result objects were never closed).
reviews = pd.read_sql_query("SELECT * FROM reviews", engine)
content = pd.read_sql_query("SELECT * FROM content", engine)

# Keep only the first row for each reviewid so the later join on reviewid
# cannot multiply rows. Reassigning instead of using inplace=True keeps the
# cell safe to re-run.
reviews = reviews.drop_duplicates(subset='reviewid')
content = content.drop_duplicates(subset='reviewid')

In [7]:
# Display five randomly chosen rows of `content` as a quick look at the loaded text.
# No random_state is passed, so a different set of rows is shown on each run.
content.sample(5)


Unnamed: 0,reviewid,content
8128,13906,That band name derives from singer/guitarist M...
10342,11316,Naming your group after yourself usually means...
3051,19404,This year has seen the 25th-anniversary reissu...
16756,7428,About the time A Series of Sneaks hit the scen...
11335,10272,\r\n Immediately on the Only Children's sec...


In [8]:
# Show the first five rows of `content`.
content.head()

Unnamed: 0,reviewid,content
0,22703,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,"Eight years, five albums, and two EPs in, the ..."
2,22659,Minneapolis’ Uranium Club seem to revel in bei...
3,22661,Kleenex began with a crash. It transpired one ...
4,22725,It is impossible to consider a given release b...


In [9]:
# Join the review text to its metadata on reviewid and keep only the two
# columns used for modelling: the text ('content') and the target ('score').
# (An index-preserving variant would reset_index() before the merge and
# set_index('index') afterwards.)
all_df = (
    pd.merge(content, reviews, on='reviewid')
    .loc[:, ['content', 'score']]
)

In [10]:
# Summarise the merged frame: number of rows, per-column non-null counts and dtypes.
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18389 entries, 0 to 18388
Data columns (total 2 columns):
content    18389 non-null object
score      18389 non-null float64
dtypes: float64(1), object(1)
memory usage: 431.0+ KB


In [11]:
# Uniqueness check on the join key: value_counts() sorts by count in descending
# order, so if the largest count shown is 1 then every reviewid occurs exactly once.
reviews['reviewid'].value_counts().head()

2047     1
17053    1
10896    1
14994    1
12947    1
Name: reviewid, dtype: int64

In [12]:
# Bag-of-words encoder: removes scikit-learn's built-in English stop words and
# ignores terms that appear in fewer than 10 documents (min_df=10).
cv = CountVectorizer(stop_words='english', min_df=10)

In [13]:
# Learn the vocabulary from the review text.
cv.fit(all_df['content'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
# Encode every review as a row of token counts using the fitted vocabulary;
# CountVectorizer.transform returns a sparse document-term matrix.
X = cv.transform(all_df['content'])

In [15]:
# Mapping from each learned term to its column index in the document-term matrix.
# NOTE(review): displaying the whole dict prints every vocabulary entry into the
# notebook output; consider showing len(cv.vocabulary_) or a small slice instead.
cv.vocabulary_

{'trip': 30614,
 'hop': 14113,
 'eventually': 10179,
 '90s': 265,
 'punchline': 23061,
 'music': 19336,
 'press': 22538,
 'shorthand': 26424,
 'hotel': 14190,
 'lounge': 17423,
 'today': 30100,
 'maligned': 17772,
 'subgenre': 28592,
 'feels': 10924,
 'like': 17063,
 'secret': 25876,
 'precedent': 22394,
 'listen': 17167,
 'canonical': 4290,
 'bristol': 3753,
 'scene': 25606,
 'albums': 950,
 'mid': 18570,
 'late': 16700,
 'genre': 12335,
 'starting': 28063,
 'chafe': 4705,
 'boundaries': 3479,
 'think': 29770,
 'claustrophobic': 5233,
 'anxious': 1423,
 '21st': 139,
 'century': 4678,
 'started': 28060,
 'years': 33194,
 'ahead': 859,
 'schedule': 25617,
 'looked': 17345,
 'right': 24809,
 'angle': 1285,
 'unbroken': 31020,
 'chain': 4708,
 'runs': 25227,
 'abrasion': 330,
 '80s': 249,
 'post': 22278,
 'punk': 23082,
 'ruminative': 25208,
 'pop': 22168,
 'dance': 7213,
 'fusion': 12077,
 'moment': 18953,
 'best': 2836,
 'aged': 813,
 'far': 10766,
 'gracefully': 12777,
 'forcefully': 1

# Deliverables for next time 

## Numerically predict continuous outcome from categorical independent variable

## X is encoded using CountVectorizer
    ## Build Naive Bayes model that takes X as input
    ## Linear regression - 
    ## RMS Error is our optimization metric
    ## Cross validation (BONUS)

## X is encoded using TFIDFVectorizer
    ## Build Naive Bayes model that takes X as input 
    ## Linear regression - 
    ## RMS Error is our optimization metric
    ## Cross validation (BONUS)

## Naive Bayes with regression - see if this exists
    ## Not really. It can be done, but it doesn't work well
    
## Bayesian ridge regression

## Truncated SVD - (dim reduction) throw in sparse matrix (PCA and SVD work for ANY matrix that is dense - getting eigenvectors through different methods)
## NMF using the actual count # for the words, not just a binary 0, 1 (?)


## Numerically predict continuous outcome from categorical independent variable

## X is encoded using CountVectorizer
    ## Linear regression - using Bayesian ridge regression
    ## RMS Error is our optimization metric
    ## Cross validation (BONUS)

## X is encoded using TFIDFVectorizer 
    ## Linear regression - using Bayesian ridge regression 
    ## RMS Error is our optimization metric
    ## Cross validation (BONUS)

    
## Bayesian ridge regression

## Truncated SVD - (dim reduction) throw in sparse matrix (PCA and SVD work for ANY matrix that is dense - getting eigenvectors through different methods)
## NMF using the actual count # for the words, not just a binary 0, 1 (?)
