In [None]:
#Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
#Import machine learning library
import tensorflow

## Data Pre-Processing

In [None]:
#Read csv into dataframe
numbers = pd.read_csv('GetPhoneInfo/wrong_and_connect.csv')
numbers.head()

In [None]:
# Build a copy of the data without the columns listed below.
# NOTE(review): `numbers2` is not referenced again in the visible cells — confirm whether
# the following cells were meant to build on it.
numbers2 = numbers.drop(columns =['Index','Toll free','First Name Match', 'Last Name Match', 'Unknown phone type', 'Possibly Portable VOIP'])
# Per-label column sums, computed on the original `numbers` frame (not `numbers2`).
numbers.groupby('label').sum()

In [None]:
#Save labels and inputs as separate variables
# NOTE(review): `numbers3` is not defined in any visible cell, so this cell fails on a
# fresh kernel; presumably it was created in a cell that has since been removed — confirm
# which frame should be used here.
X = numbers3.drop(columns =["label",'OFFICE_TELEPHONE'])
y = numbers3['label']
print(X.shape, y.shape)

In [None]:
#Import functions
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
#Split training and testing data (75%/25%)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 12, stratify = y)

In [None]:
#Create a scaler model and fit it to the data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
#Transform str labels to numerical labels
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [None]:
#One-hot encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

## Create Deep Learning Model

In [None]:
#Import model and layers from keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
#Create model: one input per feature column, one hidden layer with 3 ReLU nodes, and 2 softmax outputs
model = Sequential()
# Take the input width from the scaled training matrix rather than hard-coding 29, so the
# first layer always matches the data that is passed to fit().
model.add(Dense(units=3, activation='relu', input_dim=X_train_scaled.shape[1]))
# model.add(Dense(units=3, activation='relu'))
# model.add(Dense(units=21, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [None]:
#Compile model
# NOTE(review): binary_crossentropy is paired here with a 2-unit softmax output and one-hot
# targets; categorical_crossentropy is the conventional choice for that setup — confirm intent.
model.compile(optimizer ='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
#Print model summary
model.summary()

In [None]:
#Fit model to scaled data
# No validation data is passed, so `history` only records training loss and accuracy.
history = model.fit(X_train_scaled,
         y_train_categorical,
         epochs=50,
         shuffle=True,
         verbose=2
         )

## Quantify Trained Model

In [None]:
# summarize history for accuracy
# Keras stores the accuracy metric under 'acc' in older releases and under 'accuracy' in
# newer ones; pick whichever key this run produced instead of hard-coding one spelling.
accuracy_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[accuracy_key])
# plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
#Print model loss and accuracy on testing dataset
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f'Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}')

In [None]:
# Score the training split as well, so train and test figures can be compared later.
train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_categorical)

In [None]:
#Get predictions
# Sequential.predict_classes() was removed from tf.keras (TF 2.6). Taking the argmax of the
# softmax probabilities yields the same class indices for a multi-unit softmax output.
predictions = np.argmax(model.predict(X_test_scaled), axis=-1)
# Map the class indices back to the original string labels.
prediction_labels = label_encoder.inverse_transform(predictions)

In [None]:
#Print prediction dataframe
# Side-by-side view of predicted and actual labels for the test rows (displayed, not stored).
data = {'Predicted': prediction_labels, 'Actual':list(y_test)}
pd.DataFrame(data)

In [None]:
#Save testing set to csv
# Writes the unscaled test features to the working directory.
X_test.to_csv("test.csv")

In [None]:
#Print confusion matrix
from sklearn.metrics import confusion_matrix

In [None]:
# scikit-learn lays the matrix out with rows = actual labels and columns = predicted
# labels, both in the order given by `labels`. 'disconnected' (index 0) is treated as the
# positive class.
confusion = confusion_matrix(list(y_test), prediction_labels, labels = ['disconnected','connected'])
true_positives = confusion[0,0]    # actual disconnected, predicted disconnected
false_negatives = confusion[0,1]   # actual disconnected, predicted connected
false_positives  = confusion[1,0]  # actual connected, predicted disconnected
# Precision divides by everything *predicted* positive (column 0); recall divides by
# everything *actually* positive (row 0). The previous formulas had the two swapped.
precision = true_positives/(true_positives + false_positives)
recall = true_positives/(true_positives + false_negatives)
print(f'''
      Precision: {precision}
      Recall: {recall}
      False Positives: {false_positives}
      ''')

In [None]:
# Display the raw confusion matrix.
confusion

## Save model

In [None]:
#Save model
# Persists the trained network to an .h5 file in the working directory.
model.save("GetPhoneInfo_model_trained_new.h5")

In [None]:
# Load the previously processed numbers; the next cell selects its Index,
# OFFICE_TELEPHONE and label columns.
with_numbers = pd.read_csv('GetPhoneInfo/already_done.csv')

In [None]:
# with_numbers['Index'] = list(range(0,252))
# Keep only the join key plus the phone number and label.
actually_numbers=with_numbers[['Index','OFFICE_TELEPHONE','label']]


In [None]:
# Attach predicted and actual labels to the test features and write them out.
# NOTE: this adds two columns to X_test in place, so X_test no longer holds features only
# after this cell has run.
X_test['Predicted']=prediction_labels
X_test['Actual']=list(y_test)
X_test.to_csv('getphoneinfo_test_monday.csv')

In [None]:
# Read the file back so that the former DataFrame index becomes an ordinary column.
boop = pd.read_csv('getphoneinfo_test_monday.csv')

In [None]:
# The saved index comes back as 'Unnamed: 0'; rename it to match `actually_numbers`.
boop = boop.rename(columns={'Unnamed: 0': 'Index'})

In [None]:
# Join the predictions to the phone numbers and labels on the shared 'Index' key.
zoom = pd.merge(boop, actually_numbers, on='Index')

In [None]:
# Save the joined result without writing the DataFrame index.
zoom.to_csv('getphoneinfo_model2_results.csv',index=False)

In [None]:
# Load the separate testing-set file.
new_numbers = pd.read_csv('GetPhoneInfo/testing_set.csv')

In [None]:
#Save labels and inputs as separate variables
# NOTE: this `testX` is replaced by the next cell before anything consumes it.
testX = new_numbers.drop(columns =["Unnamed: 0",'OFFICE_TELEPHONE'])
# testy = numbers['label']
print(testX.shape)

In [None]:
# Build prediction inputs and true labels from the full number list.
testing_set = pd.read_csv('GetPhoneInfo/all_numbers.csv')
testX = testing_set.drop(columns=['OFFICE_TELEPHONE', 'Index', 'label'])
testY = testing_set['label']

In [None]:
# The network was fitted on MinMax-scaled features, so the same fitted scaler has to be
# applied to the new rows before predicting. predict_classes() was removed from tf.keras
# (TF 2.6); the argmax over the softmax outputs gives the same class indices.
predictions2 = np.argmax(model.predict(X_scaler.transform(testX)), axis=-1)
# Map the class indices back to the original string labels.
prediction_labels2 = label_encoder.inverse_transform(predictions2)

In [None]:
# Wrap the predicted labels for the full number list in a single-column frame.
goop = pd.DataFrame(prediction_labels2)

In [None]:
# Write the predictions to the working directory.
goop.to_csv('goop.csv')

In [None]:
# One-row record of the metrics gathered for the current model.
new_list = [{'TrainAccuracy': train_accuracy,
             'TrainLoss': train_loss,
             'TestAccuracy': model_accuracy,
             'TestLoss': model_loss,
             'Precision': precision,
             'Recall':recall,
             'FalsePositives':false_positives
            }]
# Start from an empty table when no earlier cell has created `all_df`, so this cell also
# works on a fresh kernel.
try:
    all_df
except NameError:
    all_df = pd.DataFrame()
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported equivalent.
all_df = pd.concat([all_df, pd.DataFrame(new_list)])

In [None]:
# Display the accumulated metrics table.
all_df

In [None]:
def train_model(X,y):
    """Train a small dense network on ``X``/``y`` and report hold-out metrics.

    The data is split with a stratified ``train_test_split`` (``random_state=7``),
    features are MinMax-scaled with a scaler fitted on the training split, and the
    labels are label-encoded and then one-hot encoded. A network with one 3-unit ReLU
    hidden layer and a 2-unit softmax output is fitted for 35 epochs, the training
    curves are plotted, and the model is scored on both splits.

    Precision, recall and the false-positive count treat ``'disconnected'`` as the
    positive class.

    Parameters
    ----------
    X : feature table; ``X.shape[1]`` sets the network's input width.
    y : labels; the confusion matrix is built for 'disconnected' / 'connected'.

    Returns
    -------
    tuple
        ``(model, new_list)`` — the fitted Keras model and a dict with the keys
        'Variables', 'TrainAccuracy', 'TrainLoss', 'TestAccuracy', 'TestLoss',
        'Precision', 'Recall' and 'FalsePositives'.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 7, stratify = y)
    # Fit the scaler on the training split only, then apply it to both splits.
    X_scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train)
    encoded_y_train = label_encoder.transform(y_train)
    encoded_y_test = label_encoder.transform(y_test)
    y_train_categorical = to_categorical(encoded_y_train)
    y_test_categorical = to_categorical(encoded_y_test)
    model = Sequential()
    model.add(Dense(units=3, activation='relu', input_dim=X.shape[1]))
    # model.add(Dense(units=3, activation='relu'))
    # model.add(Dense(units=21, activation='relu'))
    model.add(Dense(units=2, activation='softmax'))
    model.compile(optimizer ='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train_scaled,
         y_train_categorical,
         epochs=35,
         shuffle=True,
         verbose=2
         )
    # summarize history for accuracy
    # Keras records the metric as 'acc' in older releases and 'accuracy' in newer ones.
    accuracy_key = 'acc' if 'acc' in history.history else 'accuracy'
    plt.plot(history.history[accuracy_key])
    # plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
    print(f'Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}')
    train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_categorical)
    # predict_classes() was removed from tf.keras (TF 2.6); argmax over the softmax
    # probabilities yields the same class indices.
    predictions = np.argmax(model.predict(X_test_scaled), axis=-1)
    prediction_labels = label_encoder.inverse_transform(predictions)
    # scikit-learn's matrix has rows = actual labels and columns = predicted labels.
    confusion = confusion_matrix(list(y_test), prediction_labels, labels = ['disconnected','connected'])
    true_positives = confusion[0,0]    # actual disconnected, predicted disconnected
    false_negatives = confusion[0,1]   # actual disconnected, predicted connected
    false_positives  = confusion[1,0]  # actual connected, predicted disconnected
    # Precision is taken over the predicted-positive column, recall over the
    # actual-positive row (the earlier formulas had these two swapped).
    precision = true_positives/(true_positives + false_positives)
    recall = true_positives/(true_positives + false_negatives)
    new_list = {'Variables': X.shape[1],
                 'TrainAccuracy': train_accuracy,
                 'TrainLoss': train_loss,
                 'TestAccuracy': model_accuracy,
                 'TestLoss': model_loss,
                 'Precision': precision,
                 'Recall':recall,
                 'FalsePositives':false_positives
                }
    print(new_list)
    return(model, new_list)

In [None]:
def train_many_models(df):
    """Train a baseline model, then one model per left-out feature column.

    A baseline is trained on every column of ``df`` except ``'label'``. Then, for each
    feature column in turn, a model is trained with that single column removed and its
    result dict is tagged with the removed column under the key ``'Variable'``.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus a ``'label'`` column.

    Returns
    -------
    tuple
        ``(model, first_model_results, new_df)`` — the baseline model, the baseline
        result dict, and a DataFrame with one row per left-out column, sorted by
        ascending ``'Precision'``.
    """
    y = df['label']
    feature_columns = [col for col in df.columns if col != 'label']
    model, first_model_results = train_model(df.drop(columns = ['label']), y)
    all_results = []
    for col in feature_columns:
        print(f'Take out {col}')
        # Only the metrics of the reduced model are kept; the model itself is discarded.
        _reduced_model, model_results = train_model(df.drop(columns = ['label', col]), y)
        model_results['Variable'] = col
        all_results.append(model_results)
    new_df = pd.DataFrame(all_results).sort_values(by=['Precision'])
    return(model, first_model_results, new_df)

In [None]:
def find_best(df):
    """Greedy backward elimination of feature columns, driven by precision.

    Each round calls ``train_many_models`` on the current frame. If leaving out some
    column gives a higher precision than the round's baseline, the column with the
    highest precision is dropped and another round is run; otherwise the search stops.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus a ``'label'`` column. The caller's
        frame is not modified; columns are dropped on a new frame each round.

    Returns
    -------
    tuple
        ``(model, first_model_results, new_df, column_list, this_list)`` — the baseline
        model, baseline results and leave-one-out table of the final round, the columns
        remaining in the final frame, and the leave-one-out tables of every round.
    """
    this_list = []
    while True:
        model, first_model_results, new_df = train_many_models(df)
        og_precision = first_model_results['Precision']
        best_row = new_df.loc[new_df['Precision'].idxmax()]
        this_list.append(new_df)
        if best_row['Precision'] > og_precision:
            df = df.drop(columns = [best_row['Variable']])
        else:
            # The original code wrote `continue_ == False` (a comparison, not an
            # assignment), so the loop could never terminate; stop explicitly instead.
            column_list = list(df.columns)
            break
    return(model, first_model_results, new_df, column_list, this_list)

In [None]:
# Run the backward-elimination search on the `numbers` frame.
# NOTE(review): every column of `numbers` other than 'label' is treated as a feature here;
# other cells read 'Index' and 'OFFICE_TELEPHONE' from `numbers` — confirm those columns
# are meant to be part of the search.
final_model, final_model_results, final_df, best_variables, results = find_best(numbers)

In [None]:
# Number of rows in the current feature matrix.
X.shape[0]

In [None]:
# NOTE(review): `numbers3` is not defined in the visible cells, and `numbers4` is not
# used afterwards in the visible cells.
numbers4 = numbers3.drop(columns='Relevant SIC')

In [None]:
# NOTE(review): `stats_df` is not defined in the visible cells.
stats_df

In [None]:
# NOTE(review): in the visible cells `new_df` and `first_model_results` are only assigned
# inside functions, so this cell and the next rely on kernel state created elsewhere.
new_df

In [None]:
first_model_results

In [None]:
# Reference table of phone statuses; the merge below reads its 'phone' and 'status' columns.
RPV = pd.read_csv('RPV_archive.csv')
def compare_rpv(model, X_test, predicted, y_test):
    """Cross-tabulate predictions against the RPV archive status.

    The test features are tagged with predicted and actual labels, round-tripped
    through 'matching2.csv' so the DataFrame index becomes an 'Index' column, joined to
    the phone numbers in the module-level ``new_numbers`` frame, and then joined to the
    module-level ``RPV`` frame on the phone number.

    Parameters
    ----------
    model : unused; kept so existing calls keep working.
    X_test : test feature frame whose index matches ``new_numbers['Index']``.
    predicted : predicted labels, one per row of ``X_test``.
    y_test : actual labels for the same rows.

    Returns
    -------
    tuple
        ``(yummu, grouped)`` — the fully merged frame and its row counts grouped by
        ('actual', 'status', 'predicted').

    Note: a later cell in this notebook redefines ``compare_rpv`` with a different
    signature; whichever definition ran last is the one in effect.
    """
    # Work on a copy so the caller's feature frame does not gain extra columns.
    tagged = X_test.copy()
    tagged['predicted'] = predicted
    tagged['actual']=y_test
    # Round-trip through CSV: the unnamed index is read back as 'Unnamed: 0'.
    tagged.to_csv('matching2.csv')
    matching = pd.read_csv('matching2.csv')
    matching = matching.rename(columns={'Unnamed: 0':'Index'})
    JUST_NUMS = new_numbers[['Index', 'OFFICE_TELEPHONE']]
    woop = pd.merge(matching, JUST_NUMS, on='Index')
    yummu = pd.merge(woop, RPV, left_on='OFFICE_TELEPHONE', right_on = 'phone')
    grouped = yummu.groupby(['actual', 'status', 'predicted']).count()
    return(yummu, grouped)

In [None]:
# NOTE(review): this cell needs `new_numbers` to contain 'label' and 'OFFICE_TELEPHONE';
# confirm which earlier assignment of `new_numbers` is expected to be in effect here.
X = new_numbers.drop(columns =["label",'OFFICE_TELEPHONE'])
y = new_numbers['label']
# train_model() returns (model, results) only; unpacking four values raised ValueError.
model1, new_list1 = train_model(X,y)
# Recreate the held-out split with the same arguments train_model() uses internally
# (random_state=7, stratify=y), which selects the same test rows.
_X_train1, X_test1, _y_train1, y_test1 = train_test_split(X, y, random_state = 7, stratify = y)

In [None]:
# Rebuild `new_numbers` from `numbers` without the columns listed below.
# NOTE(review): the preceding cell already reads from `new_numbers`, so the cell order
# here does not match the data flow — confirm the intended order.
new_numbers = numbers.drop(columns=['Toll free', 'Unknown phone type', 'Workplace Match', 'Wireless type', 'Wireless note', 'No Geocoordinates', 'Last Name Match', 'Landline', 'First Name Match'])

In [None]:
# NOTE(review): three arguments are passed and two values unpacked. The compare_rpv
# defined earlier takes four parameters (model, X_test, predicted, y_test); the later
# redefinition takes (X_test, y_test, prediction_labels) and returns a single value.
# This call fits neither as written.
all_stuff, grouped = compare_rpv(model1, X_test1, y_test1)

In [None]:
# Train and score the network on the current X / y at top level (same steps as
# train_model), so the split, predictions and metrics stay available to later cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 7, stratify = y)
# Fit the scaler on the training split only, then apply it to both splits.
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
model = Sequential()
model.add(Dense(units=3, activation='relu', input_dim=X.shape[1]))
# model.add(Dense(units=3, activation='relu'))
# model.add(Dense(units=21, activation='relu'))
model.add(Dense(units=2, activation='softmax'))
model.compile(optimizer ='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_scaled,
         y_train_categorical,
         epochs=35,
         shuffle=True,
         verbose=2
         )
# summarize history for accuracy
# Keras records the metric as 'acc' in older releases and 'accuracy' in newer ones.
accuracy_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[accuracy_key])
# plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
print(f'Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}')
train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_categorical)
# predict_classes() was removed from tf.keras (TF 2.6); argmax over the softmax
# probabilities yields the same class indices.
predictions = np.argmax(model.predict(X_test_scaled), axis=-1)
prediction_labels = label_encoder.inverse_transform(predictions)
# scikit-learn's matrix has rows = actual labels and columns = predicted labels;
# 'disconnected' (index 0) is treated as the positive class.
confusion = confusion_matrix(list(y_test), prediction_labels, labels = ['disconnected','connected'])
true_positives = confusion[0,0]    # actual disconnected, predicted disconnected
false_negatives = confusion[0,1]   # actual disconnected, predicted connected
false_positives  = confusion[1,0]  # actual connected, predicted disconnected
# Precision is taken over the predicted-positive column, recall over the actual-positive
# row (the earlier formulas had these two swapped).
precision = true_positives/(true_positives + false_positives)
recall = true_positives/(true_positives + false_negatives)
new_list = {'Variables': X.shape[1],
                 'TrainAccuracy': train_accuracy,
                 'TrainLoss': train_loss,
                 'TestAccuracy': model_accuracy,
                 'TestLoss': model_loss,
                 'Precision': precision,
                 'Recall':recall,
                 'FalsePositives':false_positives
                }

In [None]:
# Display the metrics dictionary built by the training cell.
new_list

In [None]:
# Cross-reference the test-set predictions with the RPV archive (four-argument form of
# compare_rpv, returning the merged frame and the grouped counts).
all_stuff, grouped = compare_rpv(model, X_test, prediction_labels, y_test)

In [None]:
# Adds a 'predictions' column to X_test in place.
X_test['predictions']=prediction_labels

In [None]:
def compare_rpv(X_test, y_test, prediction_labels):
    """Count test rows by actual label, RPV status and predicted label.

    The test features are tagged with predicted and actual labels, the DataFrame index
    is exposed as an 'Index' column, and the result is joined first to the phone
    numbers in the module-level ``new_numbers`` frame and then to the module-level
    ``RPV`` frame on the phone number.

    Parameters
    ----------
    X_test : test feature frame whose index matches ``new_numbers['Index']``.
    y_test : actual labels for the same rows.
    prediction_labels : predicted labels, one per row of ``X_test``.

    Returns
    -------
    DataFrame
        Row counts grouped by ('actual', 'status', 'predicted').
    """
    # Work on a copy so the caller's feature frame does not gain extra columns.
    tagged = X_test.copy()
    tagged['predicted'] = prediction_labels
    tagged['actual']=y_test
    # The join key is the DataFrame index. The previous `row._1` from itertuples() is the
    # first *column* (index is field 0, named 'Index'), so the merge used the wrong key.
    tagged['Index'] = tagged.index
    JUST_NUMS = new_numbers[['Index', 'OFFICE_TELEPHONE']]
    woop = pd.merge(tagged, JUST_NUMS, on='Index')
    yummu = pd.merge(woop, RPV, left_on='OFFICE_TELEPHONE', right_on = 'phone')
    grouped = yummu.groupby(['actual', 'status', 'predicted']).count()
    return(grouped)

In [None]:
# NOTE(review): `new_df` is only assigned inside functions in the visible cells, so this
# cell relies on kernel state created elsewhere.
new_df.iloc[:,1]

In [None]:
# NOTE(review): `df` is not defined at top level in the visible cells.
for col in df.columns:
    print(col)

In [None]:
# Reassigns `numbers` without the 'Toll free' column, so the cell cannot be run twice:
# a second run no longer finds the column to drop.
numbers = numbers.drop(columns='Toll free')

In [None]:
# Keep the label plus a hand-picked list of feature columns.
new_numbers = numbers[['label','Address Match','City Match','Connected','Disconnected','High Quality','INF','Low Quality','NO SIC','No Date','Ported','Relevant Name','Relevant SIC','State Match','ZipCode Match']]

In [None]:
# Split the selected columns into features and labels.
X = new_numbers.drop(columns =["label"])
y = new_numbers['label']
print(X.shape, y.shape)

In [None]:
# Train once on the selected columns; the returned (model, results) tuple is displayed,
# not stored.
train_model(X,y)

In [None]:
# Run the backward-elimination search on the selected columns; the result is displayed,
# not stored.
find_best(new_numbers)

In [None]:
# Narrow the frame to a smaller hand-picked set of feature columns plus the label.
new_new_numbers = new_numbers[['ZipCode Match','Connected','High Quality', 'INF','NO SIC','Ported','Relevant Name','label']]

In [None]:
# Train and score the network on the reduced column set at top level, so the split,
# predictions and metrics stay available to later cells.
# NOTE(review): unlike the other training cells, the features are not MinMax-scaled
# here — confirm that is intentional.
X = new_new_numbers.drop(columns =["label"])
y = new_new_numbers['label']
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 7, stratify = y)
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
model = Sequential()
model.add(Dense(units=3, activation='relu', input_dim=X.shape[1]))
# model.add(Dense(units=5, activation='relu'))
# model.add(Dense(units=21, activation='relu'))
model.add(Dense(units=2, activation='softmax'))
model.compile(optimizer ='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train,
         y_train_categorical,
         epochs=25,
         shuffle=True,
         verbose=2
         )
# summarize history for accuracy
# Keras records the metric as 'acc' in older releases and 'accuracy' in newer ones.
accuracy_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[accuracy_key])
# plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
plt.show()
model_loss, model_accuracy = model.evaluate(X_test, y_test_categorical, verbose=2)
print(f'Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}')
train_loss, train_accuracy = model.evaluate(X_train, y_train_categorical)
# predict_classes() was removed from tf.keras (TF 2.6); argmax over the softmax
# probabilities yields the same class indices.
predictions = np.argmax(model.predict(X_test), axis=-1)
prediction_labels = label_encoder.inverse_transform(predictions)
# scikit-learn's matrix has rows = actual labels and columns = predicted labels;
# 'disconnected' (index 0) is treated as the positive class.
confusion = confusion_matrix(list(y_test), prediction_labels, labels = ['disconnected','connected'])
true_positives = confusion[0,0]    # actual disconnected, predicted disconnected
false_negatives = confusion[0,1]   # actual disconnected, predicted connected
false_positives  = confusion[1,0]  # actual connected, predicted disconnected
# Precision is taken over the predicted-positive column, recall over the actual-positive
# row (the earlier formulas had these two swapped).
precision = true_positives/(true_positives + false_positives)
recall = true_positives/(true_positives + false_negatives)
new_list = {'Variables': X.shape[1],
                 'TrainAccuracy': train_accuracy,
                 'TrainLoss': train_loss,
                 'TestAccuracy': model_accuracy,
                 'TestLoss': model_loss,
                 'Precision': precision,
                 'Recall':recall,
                 'FalsePositives':false_positives
                }
print(new_list)

In [None]:
# Display the confusion matrix from the training cell.
confusion

In [None]:
# Inline version of the RPV comparison for the reduced-feature model.
# These two assignments add columns to X_test in place.
X_test['predicted'] = prediction_labels
X_test['actual']=y_test
# Round-trip through CSV so the DataFrame index comes back as the 'Unnamed: 0' column,
# which is then renamed to 'Index' and used as the join key.
X_test.to_csv('matching2.csv')
matching = pd.read_csv('matching2.csv')
matching = matching.rename(columns={'Unnamed: 0':'Index'})
# Phone numbers are taken from `numbers` here (the compare_rpv functions use `new_numbers`).
JUST_NUMS = numbers[['Index', 'OFFICE_TELEPHONE']]
woop = pd.merge(matching, JUST_NUMS, on='Index')
# Attach the RPV archive rows whose 'phone' equals the office telephone.
yummu = pd.merge(woop, RPV, left_on='OFFICE_TELEPHONE', right_on = 'phone')
# The next line is a bare expression in the middle of the cell; its value is discarded.
yummu[['status', 'predicted','actual']]
grouped = yummu.groupby(['actual', 'status', 'predicted']).count()

In [None]:
# Keep only the last archive row for each phone number.
# NOTE(review): this cell sits after the merge that uses `RPV`; the merge only sees the
# de-duplicated table if this cell is run first — confirm the intended order.
RPV = RPV.drop_duplicates(subset='phone', keep='last')

In [None]:
confusion

In [None]:
# Same counts as `grouped`, with the grouping keys in a different order.
yummu.groupby(['status', 'predicted','actual']).count()

In [None]:
grouped

In [None]:
# Predicted vs. actual labels for the test rows.
data = {'Predicted': prediction_labels, 'Actual':list(y_test)}
Z = pd.DataFrame(data)
# NOTE(review): `sum()` is applied to the 'Predicted' label column; presumably a count
# per actual label was intended — confirm.
Z.groupby('Actual').sum()