In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import joblib
import boto3
from io import StringIO


In [13]:
# Load data: download the diabetes CSV from S3 and parse it into a DataFrame.
bucket_name = 'ml-ops-zenon'
file_key = 'Input/diabetes-dev-1.csv'
s3_client = boto3.client('s3')

# Retrieve the object, then turn its raw bytes into UTF-8 text.
response = s3_client.get_object(Key=file_key, Bucket=bucket_name)
raw_bytes = response['Body'].read()
csv_data = raw_bytes.decode('utf-8')

# Parse the CSV text from an in-memory text buffer.
data = pd.read_csv(StringIO(csv_data))

# Sanity check: show the leading rows of the loaded table.
print(data.head())
# print(sklearn.__version__)

   Diabetic  Pregnancies  PlasmaGlucose  DiastolicBloodPressure  \
0         0            1             78                      41   
1         0            0            116                      92   
2         1            8            171                      42   
3         1            3            108                      63   
4         1            8            153                      99   

   TricepsThickness  SerumInsulin        BMI  DiabetesPedigree  Age  PatientID  
0                33           311  50.796392          0.420804   24    1142956  
1                16           184  18.603630          0.131156   22    1823377  
2                29           160  35.482247          0.082671   22    1916381  
3                45           297  49.375169          0.100979   46    1247480  
4                15            41  35.062139          0.116191   22    1516947  


In [14]:
# Split data: separate the target from the features, then hold out a test set.
# The "Diabetic" column is the label; it and "PatientID" are excluded from the features.
y = data["Diabetic"]
X = data.drop(columns=["Diabetic", "PatientID"])
print(X)

# 2. Reserve 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

      Pregnancies  PlasmaGlucose  DiastolicBloodPressure  TricepsThickness  \
0               1             78                      41                33   
1               0            116                      92                16   
2               8            171                      42                29   
3               3            108                      63                45   
4               8            153                      99                15   
...           ...            ...                     ...               ...   
4995           10             65                      60                46   
4996            2             73                      66                27   
4997            0             93                      89                43   
4998            0            132                      98                18   
4999            3            114                      65                47   

      SerumInsulin        BMI  DiabetesPedigree  Age  
0       

In [15]:

# 3. Define a pipeline for preprocessing and model training
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize the data
    # Step 2: Train the model using RandomForest.
    # The forest is a randomized estimator; without a fixed seed every run
    # yields different trees, so the selected hyperparameters, the report
    # below and the saved model would change on each Restart & Run All.
    # The seed matches the one used for the train/test split.
    ('classifier', RandomForestClassifier(random_state=42)),
])

# 4. Define a grid of hyperparameters for tuning.
# Keys use the `<step name>__<parameter>` form so they are routed to the
# 'classifier' step of the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200],  # Number of trees
    'classifier__max_depth': [10, 20, None],  # Maximum depth of each tree
    'classifier__min_samples_split': [2, 5],  # Minimum samples required to split an internal node
}

# 5. Perform GridSearchCV to tune hyperparameters
# (5-fold cross-validation over every grid combination, using all CPU cores)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# 6. Train the model using the training data only; the test set stays unseen
grid_search.fit(X_train, y_train)

# 7. Get the best model after hyperparameter tuning
best_model = grid_search.best_estimator_

# 8. Make predictions on the held-out test set
y_pred = best_model.predict(X_test)

# 9. Evaluate the model performance
print(classification_report(y_test, y_pred))

# 10. Save the trained model (scaler + classifier pipeline) to a file for future use.
# Kept as the cell's last expression so the written file name is shown as output.
joblib.dump(best_model, 'random_forest_model.pkl')


              precision    recall  f1-score   support

           0       0.94      0.95      0.95       988
           1       0.91      0.88      0.89       512

    accuracy                           0.93      1500
   macro avg       0.92      0.92      0.92      1500
weighted avg       0.93      0.93      0.93      1500



['random_forest_model.pkl']

In [None]:
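# NOTE: Retired scratch cell, kept for reference only (linear regression on the
# Boston housing data). `datasets.load_boston` was removed in scikit-learn 1.2,
# so this code will not run as-is on current versions if uncommented.
# import pandas as pd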
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.metrics import mean_squared_error
# from sklearn import datasets
# from sklearn.model_selection import train_test_split
# from sklearn import metrics
# import joblib

# #Load data
# boston = datasets.load_boston()
# df = pd.DataFrame(boston.data, columns = boston.feature_names)
# df['MEDV'] = boston.target 

# #Split Model
# X = df.drop(['MEDV'], axis = 1) 
# y = df['MEDV']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

# #Model Creation
# lm = LinearRegression()
# lm.fit(X_train,y_train)


# with open('model.joblib', 'wb') as f:
#     joblib.dump(lm,f)


# with open('model.joblib', 'rb') as f:
#     predictor = joblib.load(f)

# print("Testing following input: ")
# print(X_test[0:1])
# sampInput = [[0.09178, 0.0, 4.05, 0.0, 0.51, 6.416, 84.1, 2.6463, 5.0, 296.0, 16.6, 395.5, 9.04]]
# print(type(sampInput))
# print(predictor.predict(sampInput))