In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [3]:
train1  = train.copy()
test1 = test.copy()

In [4]:
test1['Sex'].value_counts()

male      266
female    152
Name: Sex, dtype: int64

In [5]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [6]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [7]:
# Select only 3 columns for demonstration of deployment using Flask app
#'Age', 'Sex', 'Embarked', 'Survived'
train = train[['Age', 'Sex', 'Embarked', 'Survived']]
test = test[['Age', 'Sex', 'Embarked',]]
train.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,1
2,26.0,female,S,1
3,35.0,female,S,1
4,35.0,male,S,0


In [8]:
test.head()

Unnamed: 0,Age,Sex,Embarked
0,34.5,male,Q
1,47.0,female,S
2,62.0,male,Q
3,27.0,male,S
4,22.0,female,S


# preprocessing
"Sex" and "Embarked" are categorical features with non-numeric values
and that is why they require some numeric transformations.
“Age” feature has missing values. These values can be imputed with a summary statistic
such as median or mean. Missing values can be quite meaningful, and it is worth investigating 
what they represent in real-world applications.

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 28.0+ KB


In [10]:
# scikit-learn treats the cell values which do not contain anything as NaNs. Here, you will 
# merely replace NaNs with 0, and you will write a helper function for that.

In [11]:
categoricals = []
for col, col_type in train.dtypes.iteritems():
    if col_type == 'O':
        categoricals.append(col)
    else:
        train[col].fillna(0,inplace=True)    

In [12]:
train['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [13]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [14]:
categoricals = []
for col, col_type in test.dtypes.iteritems():
    if col_type == 'O':
        categoricals.append(col)
    else:
        test[col].fillna(0,inplace=True)    

In [15]:
# Dummify
train_dum = pd.get_dummies(train, columns= categoricals,dummy_na = True)
test_dum = pd.get_dummies(test,columns=categoricals,dummy_na=True)
print(train_dum.columns)
print(test_dum.columns)

Index(['Age', 'Survived', 'Sex_female', 'Sex_male', 'Sex_nan', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Embarked_nan'],
      dtype='object')
Index(['Age', 'Sex_female', 'Sex_male', 'Sex_nan', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Embarked_nan'],
      dtype='object')


In [16]:
train_dum =train_dum [['Age', 'Survived', 'Sex_female', 'Sex_male', 'Embarked_C','Embarked_Q', 'Embarked_S']]
test_dum = test_dum[['Age', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q','Embarked_S']]


In [17]:
print(train_dum.shape)
print(test_dum.shape)

(891, 7)
(418, 6)


In [18]:
# Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression
dependent_variable = 'Survived'
x = train_dum[train_dum.columns.difference([dependent_variable])]
y = train_dum[dependent_variable]
lr = LogisticRegression()
lr.fit(x,y)
pred_titanic = lr.predict(test_dum)
pred_titanic
submission = pd.DataFrame({
    "PassengerId" : test1["PassengerId"],
    "Survived" : pred_titanic
})
submission.to_csv("submission11.csv",index=False)



In [None]:
#NB

In [21]:

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x,y)
pred_NB = gnb.predict(test_dum)
pred_NB
submission = pd.DataFrame({
    "PassengerId" : test1["PassengerId"],
    "Survived" : pred_NB
})
submission.to_csv("sub_NB.csv",index = False)

In [22]:
#svc

In [23]:
from sklearn.svm import LinearSVC
svc_model = LinearSVC()
svc_model.fit(x,y)
svc_pred = svc_model.predict(test_dum)
submission = pd.DataFrame({
            "PassengerId" : test1['PassengerId'],
            "Survived" : svc_pred
})
submission.to_csv("sub_svc.csv",index = False)



In [24]:
#SGD Classifier

In [26]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(x,y)
sgd_sub = sgd.predict(test_dum)
submission = pd.DataFrame({"PassengerId":test1["PassengerId"],"Survived":sgd_sub})
submission.to_csv("sub_SGD1.csv",index=True)




In [None]:
#KNN

In [28]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x,y)
knn_pred = neigh.predict(test_dum)
submission = pd.DataFrame({
                "PassengerId" : test1["PassengerId"],
                "Survived" : knn_pred
})
submission.to_csv("sub_KNN.csv",index=True)

In [29]:
# Saving the model: Serialization and Deserialization
# By submitting in Kaggle got some what good accuracy with 4 - variables 
# So searilize the model using joblib

from sklearn.externals import joblib
joblib.dump(knn_pred,'model.pkl')
#loading the model is called deserilization
k = joblib.load('model.pkl')


In [30]:
# Creating an API from a machine learning model using Flask
# For serving your model with Flask

# Load the already persisted model into memory when the application starts,
# Create an API endpoint that takes input variables, transforms them into the appropriate format,
## and returns predictions.

In [31]:
################################### JSON list of inputs
# #sample input to the API 
# [
#     {"Age": 45, "Sex": "Female", "Embarked": "C"},
#     {"Age": 64, "Sex": "male", "Embarked": "C"},
#   ]

################################### API Output could be
#{"prediction": [0, 1, 1, 0]}

In [32]:
#Let's write a function predict() which will do:

# Load the persisted model into memory when the application starts,
# Create an API endpoint that takes input variables, transforms them into the appropriate 
# format, and returns predictions.

# already seen how to load a persisted model. Now, you will focus on how you can use it for 
# predicting the survival status upon receiving inputs.


In [33]:
# The function that you wrote would only work under conditions where the incoming request contains 
# all possible values for the categorical variables which may or may not be the case in real-time.
# If the incoming request does not include all possible values of the categorical variables then 
# as per the current method definition of predict(), get_dummies() would generate a dataframe that
# has fewer columns than the classifier excepts, which would result in a runtime error.
# 
# To solve this problem, you will persist the list of columns during model training as well. 
# You can serialize any Python object into a .pkl file. You will use joblib in the same way
# as previously.

In [34]:
# it is always better to do all the server level coding in a text editor and then run it from a 
# terminal

In [35]:
model_columns = list(x.columns)
model_columns 

['Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Sex_female', 'Sex_male']

In [36]:
joblib.dump(model_columns,'model_columns.pkl')

['model_columns.pkl']

In [None]:

if __name__ == '__main__':
        # Model is loaded when the API is launched
    model = pickle.load(open('modelfile', 'rb'))
    app.run(debug=True)

In [40]:

from flask import Flask, jsonify
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
        json_ = request.json
        query_df = pd.DataFrame(json_)
        query = pd.get_dummies(query_df)
        prediction = lr.predict(query)
        return jsonify({'prediction': list(prediction)})

In [46]:
# # Your API endpoint URL would consist /predict
@app.route('/api/predict', methods=['POST'])
def predict():
    if lr:
        try:
            json_ = request.json
            query = pd.get_dummies(pd.DataFrame(json_))
            query = query.reindex(columns=model_columns, fill_value=0)

            prediction = list(lr.predict(query))

            return jsonify({'prediction': prediction})

        except:

            return jsonify({'trace': traceback.format_exc()})
    else:
        print ('Train the model first')
        return ('No model here to use')


# You included all the required elements in the "/predict" API, and now you just need to write the main class.

AssertionError: View function mapping is overwriting an existing endpoint function: predict

In [47]:
# You included all the required elements in the "/predict" API, and now you just need to write
# the main class.
if __name__ == '__main__':
    try:
        port = int(sys.argv[1]) # This is for a command-line argument
    except:
        port = 12345 # If you don't provide any port then the port will be set to 12345
    lr = joblib.load(model_file_name) # Load "model.pkl"
    print ('Model loaded')
    model_columns = joblib.load(model_columns_file_name) # Load "model_columns.pkl"
    print ('Model columns loaded')
    app.run(port=port, debug=True)

NameError: name 'model_file_name' is not defined