In [None]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report



In [None]:
import acquire
import prepare
import wrangle
import github_acquire

In [None]:
# df_raw = pd.read_json('repos.json')
# df.head()

In [None]:
# len(df_raw.language[df_raw.language.isnull()])

In [None]:
# Load the repo dataset from the local JSON file and preview the first rows.
df = pd.read_json('repos_clean.json')
df.head()

In [None]:
# Spot-check a random sample of rows. The fixed seed (the same 123 used for
# the train/validate/test splits) makes the sample identical on every re-run.
df.sample(25, random_state=123)

In [None]:
# # add a column that is a list of each word for each repo --> PREPARE.PY
# words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.clean] 

# # column name will be words, and the column will contain lists of the words in each doc
# df = pd.concat([df, pd.DataFrame({'words': words})], axis=1)
# df.head()

### Splitting the data

In [None]:


# Split into train / validate / test, stratifying on the target column so
# each subset is stratified by language.
# train_test_split is already imported in the notebook's first cell.

# First hold out 20% of the rows as the test set ...
train_validate, test = train_test_split(df[['language', 'lemmatized', 'clean']],
                                        stratify=df.language,
                                        test_size=.2, random_state=123)

# ... then take 25% of the remainder as validate
# (0.25 * 0.8 = 20% of all rows), leaving roughly 60% for train.
train, validate = train_test_split(train_validate,
                                   stratify=train_validate.language,
                                   test_size=.25, random_state=123)

# Features only: drop the target column ('language').
X_train = train.drop(columns=['language'])

In [None]:
# Target for modeling: the language label of each training row.
y_train = train.language
y_train

In [None]:
# Per-language row counts and proportions in the training split.
labels = pd.concat(
    [
        train.language.value_counts(),                # absolute count per language
        train.language.value_counts(normalize=True),  # proportion of rows per language
    ],
    axis=1,
    keys=['n', 'percent'],  # resulting column names
)
labels

In [None]:
# Split the data using train_test_split, doing it twice so that we have
# separate train, validate, and test sets.
# NOTE(review): this repeats the split from the "Splitting the data" cell
# with identical arguments and seed, so it produces the same three subsets.
# train_test_split is already imported in the notebook's first cell, so the
# duplicate import was dropped from this cell.

train_validate, test = train_test_split(df[['language', 'lemmatized', 'clean']],
                                        stratify=df.language,
                                        test_size=.2, random_state=123)

train, validate = train_test_split(train_validate,
                                   stratify=train_validate.language,
                                   test_size=.25, random_state=123)

In [None]:
# Shape (rows, columns) of each split, in train / validate / test order.
tuple(split.shape for split in (train, validate, test))

In [None]:
# Count and proportion of each language in the training split.
train_labels = pd.concat([train.language.value_counts(), # total count of each language
                    train.language.value_counts(normalize=True)], axis=1) # proportion of each language

train_labels.columns = ['n', 'percent']
train_labels

#### Bag of Words Setup

In [None]:
# Build a bag-of-words model with CountVectorizer.
#   stop_words='english': remove English stop words.
#   min_df=20: ignore terms whose document frequency is strictly below 20
#       (an int is an absolute document count; a float would be a proportion).
#   ngram_range=(1, 2): extract both unigrams and bigrams.
#   binary=True: record presence/absence (1/0) instead of raw counts.
vectorizer = CountVectorizer(stop_words='english',
                             min_df=20,
                             ngram_range=(1, 2),
                             binary=True)

# Learn the vocabulary from the training split's `clean` column only.
vectorizer.fit(train.clean)

# Show the learned vocabulary. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()

In [None]:
# Encode every training document as a row of the bag-of-words matrix.
bow = vectorizer.transform(train['clean'])

# Dense copy purely for inspection: show the first document's 0/1 vector.
bow_array = bow.toarray()
bow_array[0]

#### TF-IDF Setup

In [None]:
# TF-IDF vectorizer configured with the same stop words, min_df, n-gram range
# and binary flag as the bag-of-words vectorizer.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=20,
    ngram_range=(1, 2),
    binary=True,
)

# Fit on the training text and keep the resulting sparse document-term matrix.
tfidf_sparse_matrix = tfidf.fit_transform(train['clean'])
tfidf_sparse_matrix

In [None]:
# Preview the TF-IDF weights of the first five training documents.
# Only the previewed rows are converted to a dense array instead of the whole
# sparse matrix, and get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
pd.DataFrame(tfidf_sparse_matrix[:5].toarray(), columns=tfidf.get_feature_names_out())

In [None]:
# Show the fitted TF-IDF vectorizer's vocabulary_ (term -> feature index mapping).
tfidf.vocabulary_

In [None]:
# Transform the training text into a TF-IDF document-term matrix.
# Reads train.clean directly, consistent with the other vectorizer cells, so
# this cell does not depend on X_train having been defined.
vector_spaces = tfidf.transform(train.clean)
vector_spaces.toarray()

#### Creating the feature (X_bow, X_tfidf) and target (y) variables for modeling:

In [None]:
# Target: the language label of every training row.
y = train.language

# Feature matrices: the bag-of-words and TF-IDF representations built earlier.
X_bow, X_tfidf = bow, tfidf_sparse_matrix

In [None]:
# Predicting based on BoW: fit a logistic regression on the bag-of-words
# features. (LogisticRegression is already imported in the notebook's first
# cell, so it is not re-imported here.)
lm = LogisticRegression().fit(X_bow, y)

# Attach the in-sample predictions as a new column. assign() returns a new
# frame instead of writing into the frame handed back by train_test_split,
# which avoids pandas' SettingWithCopyWarning.
train = train.assign(predicted=lm.predict(X_bow))
train.head()

In [None]:
# Actual (rows) vs. predicted (columns) language on the training split.
# A bare confusion_matrix(...) call used to precede this line, but only the
# last expression of a cell is displayed, so its result was computed and
# thrown away; the labelled crosstab is the table that is actually shown.
pd.crosstab(train.language, train.predicted)

In [None]:
# Text classification report for the bag-of-words predictions on the training split.
print(classification_report(train.language, train.predicted))

In [None]:
# Fit a second logistic regression, this time on the TF-IDF features.
lm_tfidf = LogisticRegression().fit(X_tfidf, y)

# Store the in-sample predictions in a new column. assign() builds a new frame
# rather than writing into the frame returned by train_test_split, which
# avoids pandas' SettingWithCopyWarning.
train = train.assign(pred_tfidf=lm_tfidf.predict(X_tfidf))

In [None]:
# Text classification report for the TF-IDF predictions on the training split.
print(classification_report(train.language, train.pred_tfidf))

In [None]:
# Shape (rows, columns) of the validate split.
validate.shape