In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import dill
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score
from flask import Flask, request, jsonify

In [28]:
!pip install flask



In [29]:
# Load the raw data set (columns: User ID, Gender, Age, AnnualSalary, Purchased).
df = pd.read_csv("./car_data.csv")
# Bare expression on the last line: the frame is rendered as the cell output.
df

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,385,Male,35,20000,0
1,681,Male,40,43500,0
2,353,Male,49,74000,0
3,895,Male,40,107500,1
4,661,Male,25,79000,0
...,...,...,...,...,...
995,863,Male,38,59000,0
996,800,Female,47,23500,0
997,407,Female,28,138500,1
998,299,Female,48,134000,1


In [30]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User ID       1000 non-null   int64 
 1   Gender        1000 non-null   object
 2   Age           1000 non-null   int64 
 3   AnnualSalary  1000 non-null   int64 
 4   Purchased     1000 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 39.2+ KB


In [31]:
# Class balance of the target column.
df['Purchased'].value_counts()

0    598
1    402
Name: Purchased, dtype: int64

In [32]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
# Note: the feature frame is the whole `df`, so X_train / X_test still carry the
# 'Purchased' and 'User ID' columns.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['Purchased'],
    test_size=0.33,
    random_state=42,
)

# Write every split to its own CSV (test files first, then train), without the index.
for _split, _csv_path in (
    (X_test, "X_test.csv"),
    (y_test, "y_test.csv"),
    (X_train, "X_train.csv"),
    (y_train, "y_train.csv"),
):
    _split.to_csv(_csv_path, index=None)

In [33]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that picks ``key`` out of the incoming data frame so
    that later steps can transform just that selection.

    Parameters
    ----------
    key : label or list of labels
        Passed unchanged to ``X[key]`` in ``transform``.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that returns ``X[column]``.

    Parameters
    ----------
    column : label or list of labels
        Passed unchanged to ``X[column]`` in ``transform``.
    """
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but not used."""
        picked = X[self.column]
        return picked
       
class OHEEncoderBin(BaseEstimator, TransformerMixin):
    """
    One-hot encode a categorical column and keep only the first dummy column.

    ``fit`` records the name of the first dummy column that ``pd.get_dummies``
    produces for the training data. ``transform`` always returns exactly that
    column, so a two-level feature is reduced to a single indicator.

    Parameters
    ----------
    key : str
        Prefix handed to ``pd.get_dummies``.
    """
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the first dummy column generated from ``X``; returns ``self``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns[:1])
        return self

    def transform(self, X):
        """
        Return the dummy column(s) recorded by ``fit`` for the rows in ``X``.

        If the recorded level does not occur in ``X`` (for example a single-row
        request holding only the other level), the column is filled with 0
        instead of raising ``KeyError``.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        # reindex (rather than dummies[self.columns]) tolerates batches in which
        # the fitted level is absent and also drops any level not seen in fit.
        return dummies.reindex(columns=self.columns, fill_value=0)

In [34]:
# Model input columns; the pipeline branches select these same three columns by name.
# NOTE(review): neither name is referenced elsewhere in the visible notebook.
features = ['Gender', 'Age', 'AnnualSalary']
# Label column.
target = 'Purchased'

In [35]:
# Categorical branch: pick the Gender column, then one-hot encode it.
Gender = Pipeline(steps=[
    ('selector', FeatureSelector(column='Gender')),
    ('ohe', OHEEncoderBin(key='Gender')),
])

# Numeric branches: select the column (list key) and pass it through unchanged.
Age = Pipeline(steps=[('selector', ColumnSelector(key=['Age']))])
AnnualSalary = Pipeline(steps=[('selector', ColumnSelector(key=['AnnualSalary']))])

# Concatenate the three branches side by side into one feature matrix.
feats = FeatureUnion(transformer_list=[
    ('Gender', Gender),
    ('Age', Age),
    ('AnnualSalary', AnnualSalary),
])
                      

In [36]:
# Full model: feature union followed by a seeded random forest.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Train on the training split; as the last expression, the fitted pipeline is displayed.
pipeline.fit(X_train, y_train)




Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoderBin(key='Gender'))])),
                                                ('Age',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key=['Age']))])),
                                                ('AnnualSalary',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key=['AnnualSalary']))]))])),
                ('classifier', RandomForestClassifier(random_state=42))])

In [37]:
# Persist the fitted pipeline to rf_pipeline.dill.
with open("rf_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)

In [38]:
# Reload the held-out split from the CSV files written after the train/test split.
# y_test is read with pd.read_csv, so from here on it is a DataFrame.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")
X_test.head(3)

Unnamed: 0,User ID,Gender,Age,AnnualSalary,Purchased
0,176,Male,41,73500,0
1,448,Male,59,135500,1
2,391,Male,25,59500,0


In [39]:
# Reload the pipeline that was saved to rf_pipeline.dill.
# NOTE: unpickling (dill.load) can execute arbitrary code; only load files you trust.
with open('rf_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [40]:
# Display the reloaded pipeline.
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('Gender',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='Gender')),
                                                                 ('ohe',
                                                                  OHEEncoderBin(key='Gender'))])),
                                                ('Age',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key=['Age']))])),
                                                ('AnnualSalary',
                                                 Pipeline(steps=[('selector',
                                                                  ColumnSelector(key=['AnnualSalary']))]))])),
                ('classifier', RandomForestClassifier(random_state=42))])

In [41]:
# Keep column 1 of predict_proba (the second class) for every test row.
preds = pipeline.predict_proba(X_test)[:, 1]

# Save the scores as a one-column CSV without the index.
pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [42]:
# Peek at the first ten predicted probabilities.
preds[:10]

array([0.14, 0.95, 0.  , 0.14, 0.87, 0.95, 0.99, 0.  , 0.  , 0.  ])

In [43]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every operating point. Where precision + recall == 0 the plain
# ratio is 0/0 = NaN, and np.argmax would then pick that NaN, so the score is
# defined as 0 for those points.
pr_sum = precision + recall
has_score = pr_sum > 0
fscore = np.zeros_like(pr_sum)
fscore[has_score] = 2 * precision[has_score] * recall[has_score] / pr_sum[has_score]
# locate the index of the largest f score; the final precision/recall pair has
# no matching entry in `thresholds` (which is one element shorter), so it is
# excluded to keep thresholds[ix] in range
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.21583333333333332, F-Score=0.876, Precision=0.829, Recall=0.929


In [44]:
# Load the saved pipeline under the name `model`, which the Flask `predict` view reads.
# NOTE: unpickling (dill.load) can execute arbitrary code; only load files you trust.
with open("rf_pipeline.dill", 'rb') as in_strm:
    model = dill.load(in_strm)

In [45]:
# Re-read the saved test split (the same two files that were loaded before scoring).
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [46]:
# Flask application object; the route decorators in this cell register views on it.
app = Flask(__name__)

@app.route("/", methods=["GET"])
def general():
    """Handle GET / by returning a fixed plain-text welcome message."""
    greeting = "Welcome to prediction process"
    return greeting

@app.route('/predict', methods=['POST'])
def predict():
    """
    Score one customer and return the predicted probability as JSON.

    Expects a JSON object with the keys ``Gender``, ``AnnualSalary`` and ``Age``.

    Returns
    -------
    ``{"success": true, "predictions": <float>}`` on success.
    ``{"success": false, "error": <message>}`` with HTTP 400 when the body is
    not a JSON object, a field is missing or empty, or a numeric field cannot
    be parsed.
    """
    data = {"success": False}

    # silent=True yields None instead of raising when the body is absent or not valid JSON
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data["error"] = "Request body must be a JSON object."
        return jsonify(data), 400

    # Compare against None / "" explicitly so that a legitimate 0 is not treated as missing.
    missing = [field for field in ("Gender", "AnnualSalary", "Age")
               if request_json.get(field) in (None, "")]
    if missing:
        data["error"] = "Missing required field(s): " + ", ".join(missing)
        return jsonify(data), 400

    gender = request_json["Gender"]
    try:
        # The model was trained on numeric columns; accept numbers or numeric strings.
        annual_salary = float(request_json["AnnualSalary"])
        age = float(request_json["Age"])
    except (TypeError, ValueError):
        data["error"] = "AnnualSalary and Age must be numeric."
        return jsonify(data), 400

    preds = model.predict_proba(pd.DataFrame({"Gender": [gender],
                                              "AnnualSalary": [annual_salary],
                                              "Age": [age]}))
    # Column 1 of predict_proba for the single row; cast to a built-in float for JSON.
    data["predictions"] = float(preds[:, 1][0])
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# In this notebook __name__ is "__main__" (the server banner in the cell output
# reads: Serving Flask app "__main__"), so this starts Flask's built-in server,
# which keeps running until interrupted (CTRL+C, per the banner).
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


In [47]:
# Example payload for the /predict endpoint; every value is sent as a string.
Gender_data = "Female"
AnnualSalary_data = "138500"
Age_data = "28"

body = {'Gender': Gender_data, 'AnnualSalary': AnnualSalary_data, 'Age': Age_data}

In [48]:
# Call the /predict route in-process through Flask's test client (no running server needed).
with app.test_client() as client:
    response = client.post('/predict', json=body)
    json_data = response.get_json()

# Show the decoded JSON reply as the cell output.
json_data

OK


{'predictions': 0.83, 'success': True}