# Modeling

In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from split_get_scale import SplitGetScale




# Load the cleaned repo dataset (path is relative to the notebook's working directory).
# Later cells read its `language` column and pass "clean_stemmed" / "clean_lemmatized" to get_Xy.
df = pd.read_csv("nutrition_repos_clean_stemmed_lemmatize.csv")

In [2]:
# Hold out 20% of the rows as the test set; the remaining 80% is train+validate.
# That 80% is then divided again: 70% train, 30% validate.

train, test = train_test_split(df, test_size=0.2, random_state=123)
train, validate = train_test_split(train, test_size=0.3, random_state=123)

In [3]:
# (rows, columns) of each partition, shown as the cell's output
train.shape, validate.shape, test.shape


((65, 5), (28, 5), (24, 5))

In [4]:
# Sanity-check the split: report the shape of each partition, one per line.

print(
    f'train -> {train.shape}',
    f'validate -> {validate.shape}',
    f'test -> {test.shape}',
    sep='\n',
)

train -> (65, 5)
validate -> (28, 5)
test -> (24, 5)


In [5]:
# Function created to split dataset

def split_data(df, test_size=.2, validate_size=.3, random_state=123):
    '''
    Split a DataFrame into train, validate, and test DataFrames.

    The split is done in two passes with sklearn's train_test_split:
    first `test_size` of `df` is held out as test, then `validate_size`
    of the remaining rows is held out as validate. What is left is train.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : share of `df` held out as test (passed to train_test_split).
    validate_size : share of the non-test rows held out as validate
        (passed to train_test_split).
    random_state : seed used for both passes so the split is reproducible.

    Returns
    -------
    (train, validate, test) DataFrames, in that order.
    '''
    # First pass: peel off the test set.
    train_validate, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    # Second pass: divide what is left into train and validate.
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state
    )

    return train, validate, test

In [6]:
# Re-create the three partitions with split_data; this rebinds train, validate and test.

train, validate, test = split_data(df)

In [7]:
# Sanity-check the partitions returned by split_data: one shape per line.

print(
    f'train -> {train.shape}',
    f'validate -> {validate.shape}',
    f'test -> {test.shape}',
    sep='\n',
)

train -> (65, 5)
validate -> (28, 5)
test -> (24, 5)


### Baseline

In [8]:
# Baseline model: predict the label 'other' for every repo.

baseline = df['language'].to_frame()
baseline['baseline'] = 'other'

In [9]:
# Display the actual language next to the constant baseline prediction
baseline

Unnamed: 0,language,baseline
0,Java,other
1,Java,other
2,JavaScript,other
3,other,other
4,Python,other
...,...,...
112,other,other
113,other,other
114,other,other
115,JavaScript,other


In [10]:
# Baseline accuracy: share of rows whose actual language equals the constant
# prediction, rounded to 2 decimals.

baseline_accuracy = round(baseline['language'].eq(baseline['baseline']).mean(), 2)
baseline_accuracy

0.44

In [11]:
# Split again, this time with the project's SplitGetScale helper (two-way: train/test).
# NOTE(review): this rebinds `train` and `test` from the earlier three-way split, while
# `validate` is not reassigned here and still refers to the earlier partition — confirm intended.

sgs = SplitGetScale()
train, test = sgs.split(df)

In [12]:
# Features (X) and labels (y) for train and test, built from the stemmed text.
(X_train_stemmed, y_train_stemmed), (X_test_stemmed, y_test_stemmed) = sgs.get_Xy(
    train,
    test,
    cols_train="clean_stemmed",
)
# Same again, built from the lemmatized text.
(X_train_lemmed, y_train_lemmed), (X_test_lemmed, y_test_lemmed) = sgs.get_Xy(
    train,
    test,
    cols_train="clean_lemmatized",
)

In [13]:
# Shapes of the stemmed feature matrices (rows, features)
print(f"train:  {X_train_stemmed.shape} , test:  {X_test_stemmed.shape}")

train:  (93, 5547) , test:  (24, 5547)


In [14]:
# Shapes of the label vectors that go with the lemmatized features
print(f"train:  {y_train_lemmed.shape} , test:  {y_test_lemmed.shape}")

train:  (93,) , test:  (24,)


In [15]:
# Display the repr of the stemmed test feature matrix
X_test_stemmed

<24x5547 sparse matrix of type '<class 'numpy.float64'>'
	with 2132 stored elements in Compressed Sparse Row format>

In [16]:
# KNN on the stemmed features
# (KNeighborsClassifier is imported with the other dependencies in the first cell.)

# Configure the classifier: vote among the 5 nearest neighbours, each weighted equally
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# Fit on the stemmed training data; `fit` returns the estimator, which the cell displays
knn.fit(X_train_stemmed, y_train_stemmed)


KNeighborsClassifier()

In [17]:
# Predicted labels for the stemmed training rows
y_train_pred = knn.predict(X_train_stemmed)

# Per-class probability estimates for the same rows
y_train_pred_proba = knn.predict_proba(X_train_stemmed)

# Mean accuracy on the training set. Scored once and stored so it can be
# reported here and reused later without re-running the classifier.
knn_train_accuracy = knn.score(X_train_stemmed, y_train_stemmed)

print('Accuracy of KNN classifier on training set: {:.3f}'
     .format(knn_train_accuracy))


Accuracy of KNN classifier on training set: 0.667


In [18]:
# Predicted labels for the stemmed TEST rows.
# NOTE: the `y_validate_*` names are kept for compatibility, but these are
# test-set predictions — no validate split is used with this model.
y_validate_pred = knn.predict(X_test_stemmed)

# Per-class probability estimates for the same test rows
y_validate_pred_proba = knn.predict_proba(X_test_stemmed)

# Mean accuracy on the test set, scored once and reused below
stemmed_test_score = knn.score(X_test_stemmed, y_test_stemmed)
print('Accuracy of KNN classifier on test set: {:.3f}'
     .format(stemmed_test_score))

# Keep a copy rounded to 3 decimals to call later
knn_test_accuracy = round(stemmed_test_score, 3)

Accuracy of KNN classifier on test set: 0.500


In [19]:
# Repeat the KNN fit and evaluation using the lemmatized features

In [20]:
# Refit the same `knn` estimator on the lemmatized training data.
# This replaces the earlier stemmed fit, so every later use of `knn` is the lemmatized model.
knn.fit(X_train_lemmed, y_train_lemmed)



KNeighborsClassifier()

In [21]:
# Evaluate the lemmatized model on the TRAINING set.
# This cell scores X_train_lemmed, so the names and the printed label say
# "train"/"training" rather than "test"; the `_lemmed` suffix keeps the
# stemmed results (y_train_pred, knn_train_accuracy) from being overwritten.
y_train_pred_lemmed = knn.predict(X_train_lemmed)

# Per-class probability estimates for the same training rows
y_train_pred_proba_lemmed = knn.predict_proba(X_train_lemmed)

# Mean accuracy on the training set, scored once and stored to call later
knn_train_accuracy_lemmed = knn.score(X_train_lemmed, y_train_lemmed)

print('Accuracy of KNN classifier on training set: {:.3f}'
     .format(knn_train_accuracy_lemmed))



Accuracy of KNN classifier on test set: 0.624


In [22]:
# Predicted labels for the lemmatized test rows
y_test_pred = knn.predict(X_test_lemmed)

# Per-class probability estimates for the same test rows
y_test_pred_proba = knn.predict_proba(X_test_lemmed)

# Mean accuracy on the test set. Scored once and stored so the report below
# reuses it instead of scoring a second time.
# NOTE: this rebinds knn_test_accuracy, replacing the stemmed test accuracy.
knn_test_accuracy = knn.score(X_test_lemmed, y_test_lemmed)

print('Accuracy of KNN classifier on test set: {:.3f}'
     .format(knn_test_accuracy))



Accuracy of KNN classifier on test set: 0.417
