In [1]:
import acquire as a
import prepare as p
import modeling as m
import os
import json
from typing import Dict, List, Optional, Union, cast

from bs4 import BeautifulSoup
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from sklearn.feature_extraction.text import CountVectorizer

from env import github_token, github_username

In [2]:
df = p.get_clean_df()

In [3]:
X_train, X_validate, X_test, y_train, y_validate, y_test = p.split_data(df, explore=False)

# Making bag of words

In [4]:
cv = CountVectorizer()
bag_of_words = cv.fit_transform(X_train.lemmatized)


In [6]:
bag_of_words

<241x4926 sparse matrix of type '<class 'numpy.int64'>'
	with 11514 stored elements in Compressed Sparse Row format>

In [7]:
bow = pd.DataFrame(bag_of_words.todense())
bow.columns = cv.get_feature_names()

In [8]:
bow

Unnamed: 0,aabb,aaro,ab,ability,abis,able,abramov,absence,absorb,abuild,...,york,youtube,yum,zdepth,zero,zip,ziptar,zodiusinfuser,zoom,zoomrotate
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
cv.get_feature_names()

['aabb',
 'aaro',
 'ab',
 'ability',
 'abis',
 'able',
 'abramov',
 'absence',
 'absorb',
 'abuild',
 'ac',
 'accelerated',
 'accepted',
 'accepting',
 'access',
 'accessed',
 'accessing',
 'accidentally',
 'accompanies',
 'accompanying',
 'according',
 'accordingly',
 'account',
 'accounted',
 'accounting',
 'accuracy',
 'accurate',
 'ace',
 'achievement',
 'acm',
 'across',
 'acting',
 'action',
 'actiongroups',
 'activate',
 'activated',
 'activatednot',
 'activation',
 'active',
 'actively',
 'activity',
 'actual',
 'actually',
 'ad',
 'adapt',
 'adapter',
 'adapting',
 'add',
 'added',
 'addedby',
 'addedimproved',
 'adding',
 'addingremoving',
 'addition',
 'additional',
 'additionally',
 'addon',
 'addremove',
 'address',
 'adecouplerstiffeningextensiontype',
 'adhered',
 'adjacency',
 'adji',
 'adjust',
 'adjustable',
 'adjusted',
 'adjustment',
 'adjusts',
 'admin',
 'adminautofilters',
 'administering',
 'administrating',
 'administration',
 'administrative',
 'adminpy',
 'ad

In [10]:
bow.apply(lambda row: row / row.sum(), axis=1)

Unnamed: 0,aabb,aaro,ab,ability,abis,able,abramov,absence,absorb,abuild,...,york,youtube,yum,zdepth,zero,zip,ziptar,zodiusinfuser,zoom,zoomrotate
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.009174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009174
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
237,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
238,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
239,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


# Making bag of bigrams

In [11]:
cv2 = CountVectorizer(ngram_range=(1, 2))
bag_of_grams = cv2.fit_transform(X_train.lemmatized)


In [12]:
pd.DataFrame(bag_of_grams.todense(),
            columns=cv2.get_feature_names())

Unnamed: 0,aabb,aabb return,aabb scaleamount,aaro,aaro sky,ab,ab noncommercial,ab testing,ability,ability create,...,zip file,zip gamedata,zip tar,ziptar,ziptar file,zodiusinfuser,zoom,zoom inout,zoomrotate,zoomrotate entire
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
239,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



# Count Vectorizer with DecisionTree

Bag Of Words

In [14]:
cvword = CountVectorizer()
X_bowT = cvword.fit_transform(X_train.lemmatized)
X_bowV = cvword.transform(X_validate.lemmatized)
tree1 = DecisionTreeClassifier(max_depth=9,random_state= 42)
tree1.fit(X_bowT, y_train)
print(tree1.score(X_bowT, y_train),
tree1.score(X_bowV,y_validate))

0.6763485477178424 0.5


Bag Of Bigrams

In [15]:
cvgram = CountVectorizer()git
X_bogT = cvgram.fit_transform(X_train.lemmatized)
x_bogV = cvgram.transform((X_validate.lemmatized))
tree2 = DecisionTreeClassifier(max_depth=3, random_state= 42)
tree2.fit(X_bog, y_train)
print(tree2.score(X_bog, y_train),tree2.score(x_bog,y_validate)

SyntaxError: unexpected EOF while parsing (941332556.py, line 5)

# Count Vectorizer with random Forest

Bag of Words

In [None]:
cvword = CountVectorizer()
X_bow = cvword.fit_transform(X_train.lemmatized)
Forest1 = RandomForestClassifier(max_depth=5, min_samples_leaf= 5,random_state= 42)
Forest1.fit(X_bow, y_train)
Forest1.score(X_bow, y_train)

In [None]:
cvgram = CountVectorizer(ngram_range=(1,2))
X_bow = cvgram.fit_transform(X_train.lemmatized)
Forest2 = RandomForestClassifier(max_depth=5, min_samples_leaf= 5, random_state= 42)
Forest2.fit(X_bow, y_train)
Forest2.score(X_bow, y_train)

# Bigram loop

In [None]:
'''for x in range(2,10):
    cv = CountVectorizer(ngram_range=(1,x ))
    hold = cv.fit_transform(X_train.lemmatized)
    Forest = RandomForestClassifier(max_depth=5, min_samples_leaf= 5)
    Forest.fit(hold, y_train)
    print(Forest2.score(hold, y_train))'''



REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

Word

In [None]:
cvword = CountVectorizer()
X_bow = cvword.fit_transform(X_train.lemmatized)
logreg = LogisticRegression(C = .3,random_state = 42)
logreg.fit(X_bow,y_train)
logreg.score(X_bow,y_train) * 100

Bigram

In [None]:
cvword.get_feature_names_out()

In [None]:
cvword1 = CountVectorizer(vocabulary=cvword.get_feature_names_out())
X_bow1 = cvword.transform(X_validate.lemmatized)
logreg.score(X_bow1,y_validate) * 100

In [None]:
import nltk.sentiment
# we call nltk.sentiment.SentimentIntensityAnalyser()
# use polarity_scores from that object
sia = nltk.sentiment.SentimentIntensityAnalyzer()
df['sentiment'] = df['lemmatized'].apply(lambda doc: sia.polarity_scores(doc)['compound'])


In [None]:
df['lem_length'] = df['lemmatized'].str.len()
Python = df[df.language == 'Python']
JavaScript = df[df.language == 'JavaScript']
C = df[df.language == 'C#']
Java = df[df.language == 'Java']
print('python = ' , Python.lem_length.mean(), 'Javascript = ', JavaScript.lem_length.mean(), 'C# =', C.lem_length.mean(), 'Java =', Java.lem_length.mean())




In [None]:
df['lem_length']

In [None]:
df.lemmatized

In [None]:
len(Python.clean).mean()

In [None]:
df

In [6]:
m.run_decision_tree(4)

Best parameters per algorithm:
------------------------------------------------------------------------------------------------------------
Decision Tree Parameters:  {'max_depth': 6}
