In [38]:
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [8]:
# Load the campaign/email dataset from a CSV in the current working directory.
secure_df = pd.read_csv(filepath_or_buffer="Complete_Secure_Dataset.csv")

In [9]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV).
# Non-inplace with errors="ignore" so the cell can be re-run, or run against a
# CSV that lacks the column, without raising KeyError.
secure_df = secure_df.drop(columns="Unnamed: 0", errors="ignore")

In [10]:
# Preview the first five rows.
secure_df.head(n=5)

Unnamed: 0,Bounce Type,Hard Bounces,Member_Rating,Opens,Soft Bounces,Successful Deliveries,Total Bounces,Total Recipients,campaign_id,email_id,Subject
0,sent,16.0,4.0,1.0,39.0,1784.0,55.0,1839.0,1,1282,Register as a member before New Years to get y...
1,sent,16.0,2.0,1.0,39.0,1784.0,55.0,1839.0,1,401,Register as a member before New Years to get y...
2,sent,16.0,3.0,3.0,39.0,1784.0,55.0,1839.0,1,1494,Register as a member before New Years to get y...
3,sent,16.0,2.0,1.0,39.0,1784.0,55.0,1839.0,1,1532,Register as a member before New Years to get y...
4,sent,16.0,2.0,1.0,39.0,1784.0,55.0,1839.0,1,411,Register as a member before New Years to get y...


In [11]:
# (rows, columns) of the frame at this point in the notebook.
secure_df.shape

(11577, 11)

In [12]:
# Encode the bounce type as a binary flag: 1 when the value is 'sent', 0 for
# anything else (including missing values). Vectorized comparison instead of a
# Python-level loop; dtype is pinned to int64 regardless of platform.
secure_df['sent_dummy'] = (secure_df['Bounce Type'] == 'sent').astype('int64')

In [13]:
# Remove the "Total Bounces" column from the frame (mutates secure_df in place).
secure_df.drop(columns=["Total Bounces"], inplace=True)

In [14]:
# The raw "Bounce Type" text column is no longer needed once it has been
# encoded as sent_dummy; remove it in place.
secure_df.drop(columns=["Bounce Type"], inplace=True)

In [15]:
# "Total Recipients" is the sum of successful deliveries and bounces, so it
# carries no extra information and is unnecessary; remove it in place.
secure_df.drop(columns=['Total Recipients'], inplace=True)

In [16]:
# Notebook-level feature (subject text) and regression target (open count).
X, y = secure_df['Subject'], secure_df['Opens']

In [17]:
# NOTE(review): this scaler instance is not referenced by any other cell
# visible in this notebook.
ss= StandardScaler()

In [18]:
# Bind the LinearRegression class itself (not an instance): the modelling
# helpers in this notebook call the object they receive to build a fresh model.
lr = LinearRegression

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
def regression_machineCV(estimator, X, y, scoring = 'RMSE', random_state = None):
    """Fit a regressor on bag-of-words counts of ``X`` and report hold-out fit.

    The data is split 70/30, a ``CountVectorizer`` (50 most frequent terms,
    NLTK English stop words removed) is fitted on the training text only, and
    the estimator is trained on the resulting count matrix.

    Parameters
    ----------
    estimator : callable
        Zero-argument factory, typically an estimator class such as
        ``LinearRegression``. It is called once to build the model, which must
        provide ``fit``, ``score`` and ``predict``.
    X : sequence of str
        Raw text documents (one per sample).
    y : array-like
        Numeric target aligned with ``X``.
    scoring : str, optional
        Unused; kept so existing calls that pass it keep working.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour of drawing a different split on every call; pass an
        int for a reproducible split.

    Returns
    -------
    list of str
        One-element list holding a message with the test-split ``score``
        (R squared for scikit-learn regressors) and the test RMSE.

    Side effects
    ------------
    Prints the training-split score.
    """
    # Use the data the caller passed in rather than reaching for notebook globals.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = .3, random_state = random_state)

    # The vocabulary is learned from the training split only, so no test text
    # influences the features.
    vectorizer = CountVectorizer(max_features = 50, stop_words = stopwords.words('english'))
    X_train_counts = vectorizer.fit_transform(X_train)
    X_test_counts = vectorizer.transform(X_test)

    model = estimator()
    model.fit(X_train_counts, y_train)
    train_score = model.score(X_train_counts, y_train)
    r_sq = model.score(X_test_counts, y_test)
    preds = model.predict(X_test_counts)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(train_score)
    return ["our R Squared is {} and our RMSE is {}".format( r_sq, rmse)]

In [21]:
# Linear regression of open counts on subject-line word counts.
regression_machineCV(
    lr,
    secure_df['Subject'],
    secure_df['Opens'],
    scoring='RMSE',
)

0.005816235791066936


['our R Squared is 0.0008287453594035821 and our RMSE is 0.44683047874436443']

In [22]:
# NOTE(review): a `global` statement at module (cell) level has no effect, and
# none of these names is assigned or read in the visible notebook; this looks
# like a leftover.
global of,tf,rf,ov,tv,rv,os,ts,rs

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
def regression_machinetfidf(estimator, X, y, scoring = 'RMSE', random_state = None):
    """Fit a regressor on TF-IDF features of ``X`` and report hold-out fit.

    The data is split 70/30, a ``TfidfVectorizer`` (100 most frequent terms,
    NLTK English stop words removed) is fitted on the training text only, and
    the estimator is trained on the resulting matrix.

    Parameters
    ----------
    estimator : callable
        Zero-argument factory, typically an estimator class such as
        ``LinearRegression``. It is called once to build the model, which must
        provide ``fit``, ``score`` and ``predict``.
    X : sequence of str
        Raw text documents (one per sample).
    y : array-like
        Numeric target aligned with ``X``.
    scoring : str, optional
        Unused; kept so existing calls that pass it keep working.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour of drawing a different split on every call; pass an
        int for a reproducible split.

    Returns
    -------
    list of str
        One-element list holding a message with the test-split ``score``
        (R squared for scikit-learn regressors), the test RMSE and the
        test-split predictions.
    """
    # Use the data the caller passed in rather than reaching for notebook globals.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = .3, random_state = random_state)

    # Vocabulary and IDF weights come from the training split only, so no test
    # text influences the features.
    vectorizer = TfidfVectorizer(max_features = 100, stop_words = stopwords.words('english'))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    model = estimator()
    model.fit(X_train_tfidf, y_train)
    r_sq = model.score(X_test_tfidf, y_test)
    preds = model.predict(X_test_tfidf)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return ["our R Squared is {} and our RMSE is {}.  Our predictions are {}".format( r_sq, rmse, preds)]

In [25]:
# The count-based model is not promising. How about TF-IDF? Slightly better, but still not promising.
# TF-IDF does slightly better because it normalizes the counts, but the fit is still not meaningful.
regression_machinetfidf(lr, X, y, scoring = 'RMSE')

['our R Squared is 0.0034695695833167894 and our RMSE is 0.4801747358426549.  Our predictions are [0.11464968 0.1094196  0.16509434 ... 0.16509434 0.16509434 0.16509434]']

In [26]:
# Bind the LogisticRegression class itself (not an instance): the modelling
# helpers in this notebook call the object they receive to build a fresh model.
logr = LogisticRegression

How about classification instead: can NLP on the subject line predict whether an email is successfully sent to the recipient's inbox?

In [41]:
def classification_machineCV(estimator, X, y, scoring = 'RMSE', random_state = 42):
    """Fit a classifier on bag-of-words counts of ``X`` and report its accuracy.

    The data is split 80/20, a ``CountVectorizer`` (at most 42420 terms, NLTK
    English stop words removed) is fitted on the training text only, and the
    estimator is trained on the resulting count matrix.

    Parameters
    ----------
    estimator : callable
        Zero-argument factory, typically a classifier class such as
        ``LogisticRegression``. It is called once to build the model, which
        must provide ``fit``, ``score`` and ``predict``.
    X : sequence of str
        Raw text documents (one per sample).
    y : array-like
        Class labels aligned with ``X``.
    scoring : str, optional
        Unused; kept so existing calls that pass it keep working.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; defaults to the previously
        hard-coded seed of 42.

    Returns
    -------
    list of str
        One-element list holding a message with the train and test ``score``
        (mean accuracy for scikit-learn classifiers, not R squared) and the
        test-split predictions.
    """
    # Use the data the caller passed in rather than reaching for notebook globals.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state = random_state)

    # The vocabulary is learned from the training split only, so no test text
    # influences the features.
    vectorizer = CountVectorizer(max_features = 42420, stop_words = stopwords.words('english'))
    X_train_counts = vectorizer.fit_transform(X_train)
    X_test_counts = vectorizer.transform(X_test)

    model = estimator()
    model.fit(X_train_counts, y_train)
    train_score = model.score(X_train_counts, y_train)
    test_score = model.score(X_test_counts, y_test)
    preds = model.predict(X_test_counts)

    return ["our training data has an accuracy of {} while our testing data has an accuracy of {}. Our predictions are {}".format(train_score, test_score, preds)]

In [42]:
# Logistic regression: predict the sent flag from the subject text.
classification_machineCV(
    logr,
    secure_df['Subject'],
    secure_df['sent_dummy'],
    scoring='RMSE',
)

['our training data has an r^2 of 0.9627470035633301 while our testing data has an r^2 of 0.9654576856649395. Our predictions are [1 1 1 ... 1 1 1]']

### The scores above don't mean much on their own: generally speaking, most emails do get sent, at roughly that same rate, so the model gains little over always predicting "sent". A more complex model and more data are needed.

In [43]:
def classification_machineCV(estimator, X, y, scoring = 'RMSE'):
    """Fit a classifier on TF-IDF features of the subject lines and report accuracy.

    The data is split 80/20 with a fixed seed (42), a ``TfidfVectorizer`` (at
    most 32222 terms, NLTK English stop words removed) is fitted on the
    training text only, and the estimator is trained on the resulting matrix.

    Parameters
    ----------
    estimator : callable
        Zero-argument factory, typically a classifier class such as
        ``LogisticRegression``. It is called once to build the model, which
        must provide ``fit``, ``score`` and ``predict``.
    X, y
        Currently IGNORED: the function always reads ``secure_df['Subject']``
        and ``secure_df['sent_dummy']`` (see the note in the body).
    scoring : str, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    list of str
        One-element list holding a message with the train and test ``score``
        (mean accuracy for scikit-learn classifiers, not R squared) and the
        test-split predictions.

    Side effects
    ------------
    Prints the test-split predictions.
    """
    # NOTE(review): the X / y arguments are overridden on purpose for now.
    # Callers may pass the notebook-level `y`, which is the `Opens` regression
    # target; honouring it here would silently change what is being classified.
    # Drop this override once every caller passes `sent_dummy` explicitly.
    X = secure_df['Subject']
    y = secure_df['sent_dummy']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state = 42)

    # Vocabulary and IDF weights come from the training split only, so no test
    # text influences the features.
    vectorizer = TfidfVectorizer(max_features = 32222, stop_words = stopwords.words('english'))
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    model = estimator()
    model.fit(X_train_tfidf, y_train)
    train_score = model.score(X_train_tfidf, y_train)
    test_score = model.score(X_test_tfidf, y_test)
    preds = model.predict(X_test_tfidf)
    print(preds)
    return ["our training data has an accuracy of {} while our testing data has an accuracy of {}. Our predictions are {}".format(train_score, test_score, preds)]

In [44]:
# Pass the classification inputs explicitly: the notebook-level `y` is the
# `Opens` regression target, not the sent flag this model is trained on.
classification_machineCV(logr, secure_df['Subject'], secure_df['sent_dummy'], scoring = 'RMSE')

[1 1 1 ... 1 1 1]


['our training data has an r^2 of 0.9627470035633301 while our testing data has an r^2 of 0.9654576856649395. Our predictions are [1 1 1 ... 1 1 1]']

In [45]:
# Bind the MultinomialNB class itself (not an instance): the modelling helpers
# in this notebook call the object they receive to build a fresh model.
mlb = MultinomialNB

In [46]:
# Pass the classification inputs explicitly: the notebook-level `y` is the
# `Opens` regression target, not the sent flag this model is trained on.
classification_machineCV(mlb, secure_df['Subject'], secure_df['sent_dummy'], scoring = 'RMSE')

[1 1 1 ... 1 1 1]


['our training data has an r^2 of 0.9627470035633301 while our testing data has an r^2 of 0.9654576856649395. Our predictions are [1 1 1 ... 1 1 1]']

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
# Bind the RandomForestClassifier class itself (not an instance): the modelling
# helpers in this notebook call the object they receive to build a fresh model.
rfc = RandomForestClassifier

In [49]:
# Pass the classification inputs explicitly: the notebook-level `y` is the
# `Opens` regression target, not the sent flag this model is trained on.
classification_machineCV(rfc, secure_df['Subject'], secure_df['sent_dummy'], scoring = 'RMSE')

[1 1 1 ... 1 1 1]




['our training data has an r^2 of 0.9627470035633301 while our testing data has an r^2 of 0.9654576856649395. Our predictions are [1 1 1 ... 1 1 1]']

NameError: name 's' is not defined