In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/

import pickle
from tqdm import tqdm
import os

import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter

Output hidden; open in https://colab.research.google.com to view.

In [2]:
# Mount Google Drive at /content/drive; the data-loading cell reads its CSV
# from below that mount point. `google.colab` is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Loading Data

In [2]:
# Read the first 50,000 rows of the project CSV from the mounted Drive and
# preview two of them. ('preprocessed_data.csv' was the alternative input; the
# same `nrows` argument can be lowered to load fewer rows.)
project_csv_path = '/content/drive/MyDrive/Resume Project/Donors Choose Dataset/project_data.csv'
data = pd.read_csv(project_csv_path, nrows=50000)
data.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,...,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved,essay,price,quantity,std_price,nrm_price
0,0,160221,p253737,c90749f5d961ff158d4b4d1e7dc665fc,mrs,in,2016-12-05 13:43:57,grades_prek_2,literacy_language,esl_literacy,...,,,My students need opportunities to practice beg...,0,0,My students are English learners that are work...,154.6,23,-0.390533,0.015397
1,1,140945,p258326,897464ce9ddc600bced1151f324dd63a,mr,fl,2016-10-25 09:22:10,grades_6_8,history_civics_health_sports,civics_government_teamsports,...,,,My students need a projector to help with view...,7,1,Our students arrive to our school eager to lea...,299.0,1,0.002396,0.029839


In [3]:
# List the column labels of the loaded frame.
print(data.columns)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'teacher_id', 'teacher_prefix',
       'school_state', 'project_submitted_datetime', 'project_grade_category',
       'project_subject_categories', 'project_subject_subcategories',
       'project_title', 'project_essay_1', 'project_essay_2',
       'project_essay_3', 'project_essay_4', 'project_resource_summary',
       'teacher_number_of_previously_posted_projects', 'project_is_approved',
       'essay', 'price', 'quantity', 'std_price', 'nrm_price'],
      dtype='object')


In [6]:
from sklearn.utils import resample

label_col = 'project_is_approved'

print("Before Upsampling::", data[label_col].value_counts())
print("*"*50)

# Partition the rows by label value.
class_0 = data[data[label_col] == 0]
class_1 = data[data[label_col] == 1]

# Draw from class 0 with replacement until it has as many rows as class 1.
# The fixed seed makes the draw repeatable.
class_0_upsampled = resample(
    class_0,
    replace=True,
    n_samples=len(class_1),
    random_state=123,
)

# Rebind `data` to the balanced frame: the resampled class-0 rows first, then
# every class-1 row. Because `data` is overwritten, running this cell a second
# time resamples from the already balanced frame rather than the loaded CSV.
# NOTE(review): this runs before the train/test split further down, so copies
# of one original row can land on both sides of that split -- confirm that is
# acceptable for the evaluation.
data = pd.concat([class_0_upsampled, class_1])

print("After Upsampling::",data[label_col].value_counts())

Before Upsampling:: 1    42286
0     7714
Name: project_is_approved, dtype: int64
**************************************************
After Upsampling:: 0    42286
1    42286
Name: project_is_approved, dtype: int64


# 2. Vectorizing Text data

## 2.1 Bag of words

In [7]:
# Values of the 'essay' column; the text-vectorizer cells below read this.
preprocessed_essays = data['essay'].values

In [8]:
# Bag-of-words term counts for every essay. min_df=10 keeps only terms that
# appear in at least 10 essays (rows / projects).
# NOTE: the printed label below says "one hot encodig", but the matrix holds
# bag-of-words counts.
vectorizer = CountVectorizer(min_df=10)
text_bow = vectorizer.fit_transform(preprocessed_essays)
print("Shape of matrix after one hot encodig ",text_bow.shape)

Shape of matrix after one hot encodig  (84572, 16037)


## 2.2 TFIDF vectorizer

In [9]:
# TF-IDF weights for every essay, using the same min_df=10 vocabulary cut-off
# as the bag-of-words cell. TfidfVectorizer is already imported by the first
# cell, so it is not imported again here.
# NOTE: the printed label says "one hot encodig", but the matrix holds TF-IDF
# weights.
vectorizer = TfidfVectorizer(min_df=10)
text_tfidf = vectorizer.fit_transform(preprocessed_essays)
print("Shape of matrix after one hot encodig ",text_tfidf.shape)

Shape of matrix after one hot encodig  (84572, 16037)


## 2.3 Using Pretrained Models: Avg W2V

In [10]:
# Load the pickled GloVe lookup from Drive; the embedding cells below index it
# as model[word]. `pickle` is already imported by the first cell.
# NOTE: unpickling can execute arbitrary code -- only load this file if it
# comes from a source you trust.
with open('/content/drive/MyDrive/ML Assignments/Assignment_6:Apply NB on Donors Choose /glove_vectors', 'rb') as f:
    model = pickle.load(f)

# Set of words that have a vector, for constant-time membership tests.
glove_words = set(model.keys())

In [11]:
# Average word vector per essay: the mean of the GloVe vectors of every
# whitespace-separated token that has one. An essay with no known token keeps
# the all-zero vector.
avg_w2v_vectors = []  # one vector per essay, in essay order
for essay_text in tqdm(preprocessed_essays):
    essay_sum = np.zeros(300)  # running sum; 300 entries, all starting at 0
    n_known = 0  # tokens of this essay that had a GloVe vector
    for token in essay_text.split():
        if token not in glove_words:
            continue
        essay_sum += model[token]
        n_known += 1
    if n_known:
        essay_sum /= n_known  # sum -> mean
    avg_w2v_vectors.append(essay_sum)

print(len(avg_w2v_vectors))
print(len(avg_w2v_vectors[0]))

100%|██████████| 84572/84572 [00:37<00:00, 2271.37it/s]

84572
300





## 2.4 Using Pretrained Models: TFIDF weighted W2V

In [12]:
# Fit TF-IDF on every essay to obtain one IDF weight per vocabulary term.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(tfidf_model, 'get_feature_names_out'):
    feature_names = list(tfidf_model.get_feature_names_out())
else:
    feature_names = list(tfidf_model.get_feature_names())

# word -> IDF weight, plus the vocabulary as a set for fast membership tests.
dictionary = dict(zip(feature_names, list(tfidf_model.idf_)))
tfidf_words = set(feature_names)

In [13]:
# TF-IDF weighted word vector per essay: every token that has both a GloVe
# vector and an IDF weight contributes its vector scaled by tf * idf, and the
# sum is divided by the total weight. Essays without such a token keep the
# all-zero vector.
tfidf_w2v_vectors = []  # one vector per essay, in essay order
for sentence in tqdm(preprocessed_essays):
    # Tokenise once per essay. Term frequency is counted on whole tokens:
    # str.count(word) would also match `word` inside longer words (e.g. "a"
    # inside "learn") and overstate the frequency of short words. It would
    # also rescan the full essay for every token.
    tokens = sentence.split()
    token_counts = Counter(tokens)
    n_tokens = len(tokens)

    vector = np.zeros(300)  # running weighted sum; 300 entries, all 0
    tf_idf_weight = 0  # sum of the tf-idf weights that were applied
    for word in tokens:
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # idf (dictionary[word]) times tf (share of the essay's tokens)
            tf_idf = dictionary[word] * (token_counts[word] / n_tokens)
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors.append(vector)

print(len(tfidf_w2v_vectors))
print(len(tfidf_w2v_vectors[0]))

100%|██████████| 84572/84572 [05:33<00:00, 253.24it/s]

84572
300





# 3. Vectorizing Categorical Features

In [14]:
# Encode school_state over the whole frame; binary=True makes every non-zero
# count 1. Assumes the column was already cleaned upstream.
vectorizer = CountVectorizer(binary=True)
school_state_ohe = vectorizer.fit_transform(data['school_state'].values)
print("Shape of matrix after one hot encodig ",school_state_ohe.shape)

Shape of matrix after one hot encodig  (84572, 51)


In [15]:
# Separate the label from the features and hold out 20% of the rows for
# testing, stratified so both parts keep the same class ratio.
y = data['project_is_approved'].values
x = data.drop(['project_is_approved'], axis=1)

# Fixed seed (the same one the upsampling step uses) so that the split, and
# every matrix pickled from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=123
)

In [16]:
# TF-IDF on the training essays only, limited to 300 features (min_df=10).
# TfidfVectorizer is already imported by the first cell.
tfidf_model = TfidfVectorizer(min_df=10,max_features=300)
tfidf_model.fit(X_train['essay'].values)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(tfidf_model, 'get_feature_names_out'):
    feature_names = list(tfidf_model.get_feature_names_out())
else:
    feature_names = list(tfidf_model.get_feature_names())

# word -> IDF weight, and the vocabulary as a set.
dictionary = dict(zip(feature_names, list(tfidf_model.idf_)))
tfidf_words = set(feature_names)

# NOTE: train_essay / test_essay are reassigned by the CountVectorizer cell
# further down, so these TF-IDF matrices are not what ends up in X_train_bow.
train_essay = tfidf_model.transform(X_train['essay'].values)
test_essay = tfidf_model.transform(X_test['essay'].values)

In [17]:
# Preview two rows of the training features (the label column was dropped).
X_train.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,...,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,essay,price,quantity,std_price,nrm_price
28345,28345,147158,p097367,1a056bdfa266e4d5bf99eb9e98738202,ms,ny,2016-08-12 17:04:45,grades_prek_2,literacy_language_music_arts,literature_writing_visualarts,...,It's almost that time of year again! Teachers ...,,,"My students need school supplies (crayons, mar...",38,I work at a school where many of my students r...,120.12,3,-0.484357,0.011948
27374,27374,122829,p077518,706d3ccc86f4538759dd9a3e4e2b17e0,ms,mo,2017-02-07 13:26:08,grades_prek_2,literacy_language_math_science,literacy_mathematics,...,"By having access to these tablets, my students...",,,My students need these tablets in order for th...,2,My classroom is a kindergarten and 1st grade c...,79.99,10,-0.593555,0.007934


In [18]:
def _encode_column(column):
    """Fit a default CountVectorizer on X_train[column].

    Returns the fitted vectorizer together with the transformed train and
    test matrices for that column, in that order.
    """
    encoder = CountVectorizer()
    encoder.fit(X_train[column].values)
    encoded_train = encoder.transform(X_train[column].values)
    encoded_test = encoder.transform(X_test[column].values)
    return encoder, encoded_train, encoded_test


# One encoder per categorical column, each fitted on the training split only.
# `cv` is rebound on every line, so it ends up as the sub-category encoder.
# (train_teacher / test_teacher are computed again in the next cell.)
cv, train_teacher, test_teacher = _encode_column('teacher_prefix')
cv, train_school_state, test_school_state = _encode_column('school_state')
cv, train_project_grade_category, test_project_grade_category = _encode_column('project_grade_category')
cv, train_clean_categories, test_clean_categories = _encode_column('project_subject_categories')
cv, train_clean_subcategories, test_clean_subcategories = _encode_column('project_subject_subcategories')

In [19]:
def _vectorize_and_report(column, train_label, test_label, **vectorizer_kwargs):
    """Count-vectorize one text/categorical column and print the result shapes.

    A CountVectorizer built with `vectorizer_kwargs` is fitted on
    X_train[column], then applied to the train and test splits. The two shapes
    are printed next to the label shapes, prefixed with `train_label` and
    `test_label`. Returns (fitted vectorizer, train matrix, test matrix).
    """
    fitted = CountVectorizer(**vectorizer_kwargs)
    fitted.fit(X_train[column].values)
    train_matrix = fitted.transform(X_train[column].values)
    test_matrix = fitted.transform(X_test[column].values)
    print(train_label, train_matrix.shape, y_train.shape)
    print(test_label, test_matrix.shape, y_test.shape)
    return fitted, train_matrix, test_matrix


print("Train_Data_Dimension",X_train.shape,y_train.shape)
print("Test_Data_Dimension",X_test.shape,y_test.shape)
print('='*80)

# Essay text: bag of words restricted to terms seen in at least 10 training essays.
vectorizer, train_essay, test_essay = _vectorize_and_report(
    'essay', "Train", "Test", min_df=10)
print('='*80)

# Categorical columns use the default CountVectorizer settings. `vectorizer`
# is rebound each time and ends up as the sub-category vectorizer.
vectorizer, train_state, test_state = _vectorize_and_report(
    'school_state', "Train", "Test")
print('='*80)
vectorizer, train_teacher, test_teacher = _vectorize_and_report(
    'teacher_prefix', "Train", "Test")
print('='*80)
vectorizer, train_projectgrade, test_projectgrade = _vectorize_and_report(
    'project_grade_category', "Train ", "Test ")
print('='*80)
vectorizer, train_category, test_category = _vectorize_and_report(
    'project_subject_categories', "Train_", "Test_ ")
print('='*80)
vectorizer, train_subcategory, test_subcategory = _vectorize_and_report(
    'project_subject_subcategories', "Train_subcategory ", "Test_subcategory ")

Train_Data_Dimension (67657, 22) (67657,)
Test_Data_Dimension (16915, 22) (16915,)
Train (67657, 14499) (67657,)
Test (16915, 14499) (16915,)
Train (67657, 51) (67657,)
Test (16915, 51) (16915,)
Train (67657, 5) (67657,)
Test (16915, 5) (16915,)
Train  (67657, 4) (67657,)
Test  (16915, 4) (16915,)
Train_ (67657, 50) (67657,)
Test_  (16915, 50) (16915,)
Train_subcategory  (67657, 373) (67657,)
Test_subcategory  (16915, 373) (16915,)


In [20]:
from sklearn.preprocessing import MinMaxScaler

# Scale the two numeric columns feature-wise.
#
# Normalizer rescales every *row* to unit norm. On a single column reshaped to
# (-1, 1) each row holds one value, so every non-zero entry becomes 1.0 and
# the feature carries no information. MinMaxScaler instead learns the column's
# min/max from the training split and applies that same mapping to the test
# split, so test values outside the training range fall outside [0, 1].
price_train = X_train['price'].values.reshape(-1, 1)
price_test = X_test['price'].values.reshape(-1, 1)
norm = MinMaxScaler().fit(price_train)
train_norm_price = norm.transform(price_train)
test_norm_price = norm.transform(price_test)

posted_train = X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
posted_test = X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1, 1)
# `norm` is rebound here, so after this cell it is the scaler for the
# previously-posted-projects column.
norm = MinMaxScaler().fit(posted_train)
train_norm_teacher_number = norm.transform(posted_train)
test_norm_teacher_number = norm.transform(posted_test)

In [21]:
from scipy.sparse import hstack

# Assemble the bag-of-words feature set. Column blocks, left to right: essay
# counts, teacher prefix, grade category, state, subject category, subject
# sub-category, price, previously-posted-project count.
bow_train_blocks = (
    train_essay, train_teacher, train_projectgrade, train_state,
    train_category, train_subcategory, train_norm_price, train_norm_teacher_number,
)
bow_test_blocks = (
    test_essay, test_teacher, test_projectgrade, test_state,
    test_category, test_subcategory, test_norm_price, test_norm_teacher_number,
)
X_train_bow = hstack(bow_train_blocks).tocsr()
X_test_bow = hstack(bow_test_blocks).tocsr()

In [22]:
# TF-IDF features for the essay text. The vocabulary (terms present in at
# least 10 training essays) and the IDF weights are learned from the training
# split only, then applied to both splits.
essay = X_train['essay'].values
vectorizer = TfidfVectorizer(min_df=10)
vectorizer.fit(essay)
train_essay_tf = vectorizer.transform(essay)
test_essay_tf = vectorizer.transform(X_test['essay'].values)

print("Train",train_essay_tf.shape,y_train.shape)
print("Test",test_essay_tf.shape,y_test.shape)
print('='*80)

Train (67657, 14499) (67657,)
Test (16915, 14499) (16915,)


In [23]:
# Assemble the TF-IDF feature set: the same column blocks as the bag-of-words
# set, with the essay counts swapped for the essay TF-IDF matrix. `hstack` is
# already in scope from the bag-of-words stacking cell.
tfidf_train_blocks = (
    train_essay_tf, train_teacher, train_projectgrade, train_state,
    train_category, train_subcategory, train_norm_price, train_norm_teacher_number,
)
tfidf_test_blocks = (
    test_essay_tf, test_teacher, test_projectgrade, test_state,
    test_category, test_subcategory, test_norm_price, test_norm_teacher_number,
)
X_train_tf = hstack(tfidf_train_blocks).tocsr()
X_test_tf = hstack(tfidf_test_blocks).tocsr()

## ***Dumping Train & Test data in PKL format***

In [24]:
import pickle

In [25]:
# Persist the feature matrices and labels to Drive as pickle files.
# Each file is opened in a `with` block so its handle is flushed and closed as
# soon as the dump finishes; passing a bare open(...) to pickle.dump leaves
# the handle open until it happens to be garbage-collected.
output_dir = "/content/drive/MyDrive/Resume Project/Donors Choose Dataset"
artifacts = {
    "X_train_1.pkl": X_train_bow,  # bag-of-words feature set, train
    "X_test_1.pkl": X_test_bow,    # bag-of-words feature set, test
    "X_train_2.pkl": X_train_tf,   # TF-IDF feature set, train
    "X_test_2.pkl": X_test_tf,     # TF-IDF feature set, test
    "y_train.pkl": y_train,
    "y_test.pkl": y_test,
}
for file_name, payload in artifacts.items():
    with open(f"{output_dir}/{file_name}", 'wb') as out_file:
        pickle.dump(payload, out_file)