# Model Notebook for scikit-learn


In [3]:
import pandas as pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LinearRegression

In [4]:
# Load the "tips" example dataset through seaborn and preview the first rows.
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
# Separate the feature column(s) X from the target / label column Y.
feature_columns = ['total_bill']
X = df[feature_columns]  # list selection keeps X two-dimensional (a DataFrame)
Y = df['tip']            # single-column selection gives a one-dimensional Series

In [21]:
# Split the data into a training set (80%) and a test set (20%).
# A fixed random_state makes the split, and therefore every metric and
# prediction derived from it, reproducible from one notebook run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

In [12]:
# Preprocess the data (optional): feature scaling is left disabled because the results were better without it in this run.
# scalar =StandardScaler()
# X_train =scalar.fit_transform(X_train)
# X_test =scalar.transform(X_test)

In [22]:
# Create an (unfitted) linear regression estimator.
model = LinearRegression()

# Fit it on the training split only; the test split stays unseen for evaluation.
model.fit(X=X_train, y=Y_train)

In [23]:
# Use the fitted model to predict tips for the held-out test features.
y_predict = model.predict(X=X_test)

In [24]:
# Predict the tip for a single bill of 30.
# The model was fitted on a DataFrame with a 'total_bill' column, so the input
# is passed with the same column name; a bare nested list makes scikit-learn
# warn that the input has no valid feature names.
import pandas as pd
model.predict(pd.DataFrame({'total_bill': [30]}))



array([4.0110253])

In [20]:
# Evaluate the model on the held-out test set.
# MSE is the average squared prediction error; R2 is the share of the variance
# in tip explained by the model. A low R2 means total_bill alone explains only
# part of the variation in tip.
from sklearn.metrics import mean_squared_error, r2_score

test_mse = mean_squared_error(Y_test, y_predict)
test_r2 = r2_score(Y_test, y_predict)
print('MSE: ', test_mse)
print('R2: ', test_r2)


MSE:  0.8561948300175423
R2:  0.3100078658689477


In [29]:
# Save the trained model to disk with pickle.
import os
import pickle

model_path = './saved_models/01_model.pkl'
# Create the target directory first; open() fails if it does not exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager guarantees the file is flushed and closed, even on error.
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [31]:
# Load the saved model back from disk.
# NOTE: only unpickle files you trust; pickle can execute arbitrary code on load.
import pickle

# The context manager closes the file handle as soon as loading finishes.
with open('./saved_models/01_model.pkl', 'rb') as model_file:
    model_load = pickle.load(model_file)

In [32]:
# Predict with the reloaded model to confirm it works after the save/load step.
# The input carries the 'total_bill' column name the model was fitted with,
# which avoids scikit-learn's missing-feature-names warning.
import pandas as pd
model_load.predict(pd.DataFrame({'total_bill': [40]}))

# Result: for a $40 bill, the model predicts a tip of about $5.



array([5.00584712])

# Steps of a machine learning (ML) model

# 
    1. Define the Problem: Clearly articulate the specific problem you want to solve with machine learning.
    2. Collect Data: Gather relevant and representative data from various sources.
    3. Data Preprocessing: Clean and transform the data to make it suitable for analysis.
    4. Exploratory Data Analysis (EDA): Analyze the data visually and statistically to uncover patterns and insights.
    5. Split the Data: Divide the dataset into training, validation, and test sets for model evaluation.
    6. Select a Model: Choose an appropriate machine learning algorithm based on the problem type.
    7. Train the Model: Fit the selected model to the training data to learn patterns.
    8. Evaluate the Model: Assess model performance using metrics on the validation dataset.
    9. Hyperparameter Tuning: Optimize model performance by adjusting hyperparameters.
    10. Test the Model: Evaluate the final model on the test set to ensure generalization.
    11. Deployment: Deploy the trained model into a production environment.
    12. Monitor and Maintain: Continuously track model performance and update as necessary.


# Machine Learning Workflow Steps

# 1. Define the Problem
# Problem: Predict house prices based on features.

# 2. Collect Data
import pandas as pd
data = pd.read_csv('housing_data.csv')  # Load data from a CSV file.

# 3. Data Preprocessing
# Fill missing numeric values with their column mean. numeric_only=True keeps
# DataFrame.mean() from raising on text columns (pandas >= 2.0 no longer drops
# them silently); non-numeric columns are left untouched.
data.fillna(data.mean(numeric_only=True), inplace=True)

# 4. Exploratory Data Analysis (EDA)
import seaborn as sns
sns.pairplot(data)  # Visualize pairwise relationships between features.

# 5. Split the Data
# Hold out 20% for testing; a fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('price', axis=1), data['price'], test_size=0.2, random_state=42
)

# 6. Select a Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()  # Baseline linear regression model.

# 7. Train the Model
model.fit(X_train, y_train)  # Train the baseline on the training data.

# 8. Evaluate the Model
from sklearn.metrics import mean_squared_error
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)  # Baseline mean squared error.

# 9. Hyperparameter Tuning
# GridSearchCV cross-validates each parameter combination on the training data
# and, by default, refits the best one on the whole training set.
from sklearn.model_selection import GridSearchCV
params = {'fit_intercept': [True, False]}
grid_search = GridSearchCV(LinearRegression(), params)
grid_search.fit(X_train, y_train)  # Optimize hyperparameters.

# 10. Test the Model
final_predictions = grid_search.predict(X_test)  # Predictions of the tuned model.
final_mse = mean_squared_error(y_test, final_predictions)  # Score them so the tuning result is actually measured.

# 11. Deployment
# Save the tuned estimator selected by the grid search, not the untuned baseline.
import joblib
joblib.dump(grid_search.best_estimator_, 'house_price_model.pkl')

# 12. Monitor and Maintain
# Implement logging or monitoring to track model performance in production.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
# sklearn.externals.joblib was removed from scikit-learn; joblib is its own package.
import joblib  # For saving the model

# 1. Load Data
df = pd.read_csv('path_to_data.csv')  # Replace with your dataset path

# 2. Preprocess Data
df = df.dropna()  # Drop rows with missing values
X = df.drop('target_column', axis=1)  # Features (replace 'target_column' with your target column name)
y = df['target_column']  # Target

# 3. Split Data into Train and Test
# Split BEFORE scaling so that no statistics from the test rows leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale features (important for many models).
# The scaler learns mean/std from the training rows only and is then applied
# unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Scaled copy of the full feature table, kept under its original name in case
# other cells still refer to it.
X_scaled = scaler.transform(X)

# 4. Train Model (Logistic Regression example)
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Predict
y_pred = model.predict(X_test)

# 6. Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# 7. Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# 8. Additional Metrics: Precision, Recall, F1-Score
# You can extract these from the classification report, but here's how to calculate them manually:
from sklearn.metrics import precision_score, recall_score, f1_score

# The default average='binary' raises on multi-class targets, so choose the
# averaging from the classes the model was trained on. For two classes the
# second class is treated as positive (for 0/1 labels this is the default).
is_binary = len(model.classes_) == 2
if is_binary:
    score_kwargs = {'average': 'binary', 'pos_label': model.classes_[1]}
else:
    score_kwargs = {'average': 'weighted'}

precision = precision_score(y_test, y_pred, **score_kwargs)
recall = recall_score(y_test, y_pred, **score_kwargs)
f1 = f1_score(y_test, y_pred, **score_kwargs)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# 9. ROC Curve and AUC (for binary classification)
# Only works for binary classification, not multi-class.
# Require two classes in both the model and the test labels: column 1 of
# predict_proba is the probability of model.classes_[1], which is therefore
# passed as the positive label.
if is_binary and len(np.unique(y_test)) == 2:
    fpr, tpr, thresholds = roc_curve(
        y_test, model.predict_proba(X_test)[:, 1], pos_label=model.classes_[1]
    )
    roc_auc = auc(fpr, tpr)

    print(f"ROC AUC: {roc_auc:.2f}")

    # Plot ROC Curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

# 10. Visualizing Original vs Predicted (Assuming 2D for simplicity)
if X_train.shape[1] == 2:  # Check if we have 2 features for visualization
    # Visualize original data (test set), colored by the true labels
    plt.figure(figsize=(10, 6))
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='coolwarm', label='Original Data')
    plt.title("Original Data (Test Set)")
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

    # Visualize predicted data (test set predictions), colored by the predicted labels
    plt.figure(figsize=(10, 6))
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='coolwarm', marker='x', label='Predicted Data')
    plt.title("Predicted Data (Test Set)")
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

# 11. Save the Model
# NOTE: new data must be transformed with the same fitted scaler before it is
# given to this model, so the scaler has to be available wherever the model is used.
joblib.dump(model, 'logistic_regression_model.pkl')  # Save model as a .pkl file

# 12. Test the Model with New Data
# New data point (replace with actual data). The row needs one value per
# feature column; naming the columns like the training features lets the
# scaler check them and avoids the missing-feature-names warning.
new_data = pd.DataFrame([[5.1, 3.5]], columns=X.columns)
new_data_scaled = scaler.transform(new_data)  # Don't forget to scale new data
new_prediction = model.predict(new_data_scaled)
print(f"Prediction for new data: {new_prediction[0]}")
