## Exploratory Data Analysis
**Get an understanding of which variables are important, view summary statistics, and visualize the data.**

In [None]:
# Visualize the class balance of the label column
sns.countplot(x='label_col', data=df)

In [None]:
# Histogram of a single feature.
# A wide, short rectangle is usually the most readable shape for a histogram.
plt.figure(figsize=(10, 5))
sns.histplot(x='col_a', data=df, bins=30)

Explore correlation between the **continuous feature variables**

In [None]:
# Correlation matrix, computed once and reused below.
# Only numeric/boolean columns are kept: recent pandas versions raise on
# non-numeric columns instead of silently dropping them.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Visualize the correlation matrix with a heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, cmap='icefire', annot=True)
## If a feature is a perfect predictor of the label it isn't really a feature;
## it is probably duplicate information.

# Explore a pair of correlated continuous features with a scatterplot.
# Open a fresh figure first -- otherwise seaborn draws the scatter onto the
# heatmap's axes because they are still the current axes in this cell.
plt.figure()
sns.scatterplot(data=df, x='col_a', y='col_b')

Explore the relationship between a **continuous** feature and a **categorical** feature

In [None]:
# Visualize the relationship with a boxplot
sns.boxplot(data=df, x='categorical_col', y='continuous_col')

# Summary statistics behind the boxplot
df.groupby('categorical_col')['continuous_col'].describe()

# Bar plot of each numeric feature's correlation with the categorical column.
# 'categorical_col' must already be numerically encoded (e.g. 0/1) to appear
# in the correlation matrix; non-numeric columns are excluded because recent
# pandas versions raise on them.
# A fresh figure is opened because Series.plot reuses the current axes, which
# would otherwise draw the bars on top of the boxplot above.
plt.figure()
(
    df.select_dtypes(include=['number', 'bool'])
    .corr()['categorical_col']
    .sort_values()
    .drop('categorical_col')
    .plot.bar()
)
## Which continuous features have the highest positive or negative correlation
## with the outcome of the categorical column?

Explore features in more detail

In [None]:
# All unique values of the feature, in sorted order
sorted(df['col_a'].unique())

# Visualize the distribution of the unique values
sns.countplot(x='col_a', data=df, palette='coolwarm')
## To split each bar by a categorical value, add: hue='categorical_col'
## To reorder the x axis, build feature_order = sorted(df['col_a'].unique())
## and pass: order=feature_order


In [None]:
# Look at a sub-frame made of a few selected columns
df.loc[:, ['col_a', 'col_b', 'col_c']]

In [None]:
# Keep only the rows whose col_a equals one of the values of interest
someof_col_a = df.loc[
    (df['col_a'] == 'PICK ME') | (df['col_a'] == 'LOOK AT ME')
]
## someof_col_a is now the frame to explore further


## Data PreProcessing
**Remove or fill any missing data. Remove unnecessary or repetitive features. Convert categorical string features to dummy variables.**

In [None]:
# Start by looking at the distribution of values in each column
df.describe()

In [None]:
# Add a new column that encodes the categorical outcome as 1 / 0
df['new_col'] = df['categorical_col'].map({
    'Occured': 1,
    'Did Not Occur': 0,
})

Missing Data- see if we should keep, discard, or fill in the missing data.

In [None]:
# Count of missing values per column
df.isna().sum()

# Missing values per column as a percentage of all rows
100 * df.isnull().sum() / len(df)

# CAREFUL: only drop rows with missing values when they make up a VERY small
# percentage of the df
df = df.dropna()

In [None]:
# Total unique values of a column with some missing data, then the count of
# each of those values
df['col_missing_data'].nunique()
df['col_missing_data'].value_counts()

# Countplot of a column with missing data, with the categories in sorted order
desiredorder = sorted(df['col_missing_data'].dropna().unique())
sns.countplot(x='col_missing_data', data=df, order=desiredorder)
## If the sorted order isn't how you want to plot it, write the order out by
## hand as a list and pass that instead.
## Optional countplot argument: hue='label_col'

# Ratio of the two label outcomes within each col_missing_data category.
# The label column is selected before counting so only that column is counted.
label_0 = df[df['label_col'] == "zero"].groupby("col_missing_data")['label_col'].count()
label_1 = df[df['label_col'] == "one"].groupby("col_missing_data")['label_col'].count()
label_ratio = label_0 / label_1
label_ratio
# Open a fresh figure: Series.plot reuses the current axes, which would
# otherwise draw these bars on top of the countplot above.
plt.figure()
label_ratio.plot(kind='bar')
## Is the label outcome similar across categories? If it is, the column adds
## little information -- maybe drop 'col_missing_data'.

In [None]:
# Remove a feature that has too much missing data and too many unique values
df.drop(columns=['col_missing_data'], inplace=True)

Duplicate data

In [None]:
# Inspect the values of a column that may repeat information held elsewhere
df.loc[:, 'col_a']

Filling in missing data: there are many ways to deal with missing data. We could build a simple model (such as a linear model) to predict the missing values, fill them in based on the mean of another column, or bin the column into categories and treat NaN as its own category.

In [None]:
# Value counts of the column that has missing data
df['col_c_missing_data'].value_counts()

# Review the other columns to see which correlates most highly with
# col_c_missing_data. Non-numeric columns are excluded because recent pandas
# versions raise on them; the result is printed so the header is followed by
# the data it announces.
print("Correlation with the col_c_missing_data")
print(
    df.select_dtypes(include=['number', 'bool'])
    .corr()['col_c_missing_data']
    .sort_values()
)

# If the feature that correlates most with col_c_missing_data makes sense, we
# can try a fillna()-style approach: group the df by the correlated feature
# and take the mean of col_c_missing_data within each group.
# The column is selected BEFORE aggregating so that only it is averaged;
# averaging the whole frame fails on non-numeric columns in recent pandas.
print("Mean of col_c_missing_data column per correlated_feat")
correlated_feat_avg = df.groupby('correlated_feat')['col_c_missing_data'].mean()
print(correlated_feat_avg)

# Missing col_c_missing_data values will be filled with the group mean that
# corresponds to the row's correlated_feat value, looked up in the Series above.
correlated_feat_avg[2.0]
## Demonstrates that a group mean can be fetched by indexing with a
## correlated_feat value. **2.0 may need to change based on the values in the
## Series above.**


def fill_col_c(correlated_feat, col_c_missing_data, averages=None):
    '''
    Return a usable col_c_missing_data value for a single row.

    If col_c_missing_data is NaN, return the average col_c_missing_data
    recorded for this row's correlated_feat value; otherwise return
    col_c_missing_data unchanged.

    Parameters
    ----------
    correlated_feat
        The row's correlated_feat value, used as the lookup key.
    col_c_missing_data
        The row's current col_c_missing_data value (may be NaN).
    averages
        Optional Series or dictionary mapping correlated_feat values to the
        average col_c_missing_data for that value. When omitted, the
        notebook-level correlated_feat_avg is used.

    Raises
    ------
    KeyError
        If the value is missing and correlated_feat is not a key of the
        mapping.
    '''
    # Present values pass through untouched; the mapping is not consulted.
    if not np.isnan(col_c_missing_data):
        return col_c_missing_data
    lookup = correlated_feat_avg if averages is None else averages
    return lookup[correlated_feat]
    
# Walk the frame row by row and replace each missing col_c_missing_data value
# with the group average looked up by fill_col_c
df['col_c_missing_data'] = df.apply(
    lambda row: fill_col_c(row['correlated_feat'], row['col_c_missing_data']),
    axis=1,
)
**The order of the arguments in the function definition must match the order in which they are passed inside the `apply` call.**

**Categorical Variables and Dummy Variables**

In [None]:
# List all the columns that are currently non-numeric
df.select_dtypes(include=['object']).columns
## You may want to do something with these string variables

# Turn string categories into numbers
df['col_d'] = df['col_d'].map({
    ' 36 months': 36,
    ' 60 months': 60,
})

# Reduce the number of unique values in a string column
df['col_j'].value_counts()
df['col_j'] = df['col_j'].replace(
    to_replace=['dumbvalue1', 'dumbvalue2'],
    value='lessdumbvalue',
)
## 'dumbvalue1', 'dumbvalue2' and 'lessdumbvalue' are all existing values in
## value_counts, and it makes logical sense to merge them into 'lessdumbvalue'

# Feature-engineer a zip code column from an address
df['zip_code'] = df['address'].apply(lambda addr: addr[-5:])
## Here the zip code is the last 5 characters of the address
## May need to convert with int(address[-5:])

# Ensure none of the features cause data leakage (giving away the label outcome)
df = df.drop(columns='cheater_col')

In [None]:
# Convert a string column to dummy variables -- remember drop_first=True to
# avoid duplicate information in the df
col_e_dummies = pd.get_dummies(df['col_e'], drop_first=True)

# Drop the original string column and append the new dummy columns in its place
df = pd.concat(
    [df.drop(columns='col_e'), col_e_dummies],
    axis=1,
)

# Same three steps (create dummies, drop originals, concatenate) for several
# features at once
dummies = pd.get_dummies(
    df[['col_x', 'col_y', 'col_z', 'col_w']],
    drop_first=True,
)
df = df.drop(columns=['col_x', 'col_y', 'col_z', 'col_w'])
df = pd.concat([df, dummies], axis=1)

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Features: every column except the label (fixed typo 'lavel_col', which
# raised KeyError because no such column exists).
X = df.drop('label_col', axis=1).values
# Target: the label column on its own.
y = df['label_col'].values

Grabbing a Sample for Saving Training Time

In [None]:
# Take a reproducible 10% random sample of the rows to shorten training time.
# NOTE(review): X and y are built from the full df in the cell above and
# small_df does not appear to be used afterwards -- to actually train on the
# sample, X and y would have to be built from small_df. Confirm intent.
small_df = df.sample(frac=0.1, random_state=101)
print(len(df))
print(len(small_df))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=101,
)

In [None]:
# Ensure the number of rows and the number of columns are what you expect
X_train.shape

## Normalizing the Data

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training data ONLY so that nothing about the test set
# leaks into it, then apply the same fitted scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Creating the Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.constraints import max_norm

In [None]:
# Fully connected network for a binary label and a 78-column X_train:
# three ReLU hidden layers (78 -> 39 -> 19 units), each followed by 20%
# dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(78, activation='relu'),
    Dropout(0.2),

    Dense(39, activation='relu'),
    Dropout(0.2),

    Dense(19, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Watch the validation loss (lower is better) and stop training once it has
# gone 25 epochs without improving
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=25,
    verbose=1,
)

In [None]:
# Train for up to 600 epochs; early stopping ends the run sooner.
# NOTE(review): the test split doubles as the validation data here, so early
# stopping is tuned on the same rows later used for evaluation -- consider a
# separate validation split.
model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=600,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
    verbose=1,
)

Save a model

In [None]:
from tensorflow.keras.models import load_model
# Persist the trained model to disk under the given file name
model.save('model_name.h5')

## Evaluating Model Performance

In [None]:
# Collect the values recorded during training into a DataFrame, then plot the
# training loss against the validation loss
losses = pd.DataFrame(model.history.history)
losses.loc[:, ['loss', 'val_loss']].plot()

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels
predicted_probs = model.predict(X_test)
predictions = (predicted_probs > 0.5).astype("int32")

In [None]:
# Per-class precision / recall / F1, a blank gap, then the confusion matrix
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(report)
print('\n')
print(matrix)

## Use Model to Predict an Outcome

In [None]:
import random
random.seed(101)
# randint includes BOTH endpoints, so the upper bound must be len(df) - 1;
# with len(df) the draw could land one past the last row and iloc would raise.
random_ind = random.randint(0, len(df) - 1)

# Features of the chosen row (label removed)
new_item = df.drop('label_col', axis=1).iloc[random_ind]
new_item

# The model was trained on MinMax-scaled inputs, so the new row must go through
# the same fitted scaler. reshape(1, -1) makes it a single-row 2-D array
# without hard-coding the number of feature columns.
new_item_scaled = scaler.transform(new_item.values.reshape(1, -1))

# predict_classes is no longer available in current Keras; threshold the
# sigmoid output at 0.5 instead, exactly as done for the test-set predictions.
(model.predict(new_item_scaled) > 0.5).astype("int32")

# Check the true label
df.iloc[random_ind]['label_col']