# Bismillah

In [1]:
import numpy as np
import pandas as pd
import pickle 


In [2]:
# Read the dataset from 'adult_data.csv' (relative to the working directory)
# into a DataFrame.
df = pd.read_csv('adult_data.csv')

In [9]:
# Load the preprocessing pipeline that was serialized to 'pipeline.pkl'.
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only load pickle files that come from a trusted source.
# The `with` block guarantees the file handle is closed once loading finishes.
with open('pipeline.pkl', 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)

## 4. Split the Data

- Divide your data into training and testing sets (commonly a 70-30 or 80-20 split). This helps in evaluating the model on unseen data to gauge its generalization capability.


In [4]:
from sklearn.model_selection import train_test_split
# Keep a second reference to the full frame, then separate the target column
# ('income') from the remaining feature columns.
ndf = df
y = df['income']
X = df.drop(labels=['income'], axis=1)

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Transform the dataset

In [10]:
# Fit the preprocessing pipeline on the training features and transform them,
# then apply the already-fitted pipeline to the test features (the test split
# is only transformed, never used for fitting).
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

## 5. Choose a Model
- Classification: Logistic Regression, Decision Trees, SVM, Random Forest, Gradient Boosting Machines, Neural Networks.
- Regression: Linear Regression, Ridge, Lasso, SVR, Random Forest Regressor, Gradient Boosting Regressor.

In [7]:
from sklearn.linear_model import LogisticRegressionCV

In [8]:
# Baseline classifier: logistic regression with built-in 5-fold
# cross-validation (Cs=10) and a fixed random_state.
model = LogisticRegressionCV(
    Cs=10,
    cv=5,
    random_state=42,
)


## 6. Train the Model

- Train your model using the training data. This involves fitting the model to the data and adjusting parameters. Use cross-validation to ensure that the model generalizes well over different subsets of the data.


In [13]:
# Flatten the training labels to a 1-D array, then fit the baseline model on
# the transformed training features.
train_labels = y_train.values.ravel()
model.fit(X_train_transformed, train_labels)

LogisticRegressionCV(cv=5, random_state=42)

In [15]:
# Predict labels for the transformed test features with the fitted baseline model

y_pred = model.predict(X_test_transformed)

## 7. Model Evaluation

- Evaluate the model using appropriate metrics:

   - Classification: Accuracy, Precision, Recall, F1-Score, ROC-AUC.
   - Regression: MSE, RMSE, MAE, R².

In [16]:
from sklearn.metrics import accuracy_score

In [19]:
# Score the baseline predictions against the true test labels and print the
# result as a percentage with two decimal places

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')



Accuracy: 76.54%


## 8. Model Refinement

- Refine the model based on evaluation metrics. This may involve returning to feature engineering, trying different models, or tuning hyperparameters further.


In [20]:

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier


In [21]:
from sklearn.model_selection import GridSearchCV

In [33]:
# Candidate classifiers to compare. Each one gets a fixed random_state so that
# re-running the notebook reproduces the same scores.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# Hyperparameter grids, keyed by the same names as `models`. Every value listed
# here must be valid for the corresponding *classifier*.
params = {
    "Decision Tree": {
        # Split criteria accepted by DecisionTreeClassifier. The previous
        # values ('squared_error', 'friedman_mse', 'absolute_error',
        # 'poisson') are regression criteria and are rejected by the
        # classifier.
        'criterion': ['gini', 'entropy'],
    },
    "Random Forest": {
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "Gradient Boosting": {
        'learning_rate': [.1, .01, .05, .001],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "AdaBoost": {
        'learning_rate': [.1, .01, 0.5, .001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
}

In [44]:
def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """Fit each candidate model and report its accuracy on the test set.

    Args:
        X_train: Training features, already in the form the models expect.
        y_train: Training labels aligned with ``X_train``.
        X_test: Test features, prepared the same way as ``X_train``.
        y_test: True labels aligned with ``X_test``.
        models: Mapping of display name -> unfitted estimator exposing
            ``fit`` and ``predict``. Each estimator is fitted in place.
        param: Mapping of display name -> hyperparameter grid. Accepted for
            interface compatibility; no hyperparameter search is run here, so
            its contents are not read.

    Returns:
        dict: Display name -> accuracy of that model on ``(X_test, y_test)``.

    Raises:
        Any exception raised while fitting or scoring a model propagates
        unchanged, so the original error type and traceback are preserved.
    """
    report = {}

    for name, model in models.items():
        # Use the data handed in by the caller rather than notebook-level
        # globals, so the function scores exactly what it was given.
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_test_pred)

    return report

In [45]:
# Score every candidate model on the transformed train/test split.
model_report: dict = evaluate_models(
    X_train=X_train_transformed,
    y_train=y_train,
    X_test=X_test_transformed,
    y_test=y_test,
    models=models,
    param=params,
)

In [47]:
# Display the test accuracy recorded for each model name.
model_report

{'Random Forest': 0.8265189770320899,
 'Decision Tree': 0.7752076885486235,
 'Gradient Boosting': 0.8442743117771624,
 'AdaBoost': 0.8392246294184721}

In [54]:
# Highest test accuracy in the report. max() reads the values once and states
# the intent directly, instead of sorting them all just to take the last one.
best_accuracy = max(model_report.values())
print(best_accuracy)

0.8442743117771624


In [57]:
def find_keys_by_value(dictionary, target_value):
    """Return every key of ``dictionary`` whose value equals ``target_value``.

    Keys come back in the dictionary's iteration order; the list is empty when
    no value matches.
    """
    matching_keys = []
    for key, value in dictionary.items():
        if value == target_value:
            matching_keys.append(key)
    return matching_keys


In [63]:
# Names of all models whose accuracy equals the best score; the first match is
# displayed as the selected model.
best_model = find_keys_by_value(model_report, best_accuracy)
best_model[0]

'Gradient Boosting'

In [65]:
# Fetch the estimator stored in `models` under the first best-scoring name.
best_model_object = models[best_model[0]]

##### Save the best model to a pickle file

In [70]:
# Serialize the selected model to 'best_model_object.pkl'. The `with` block
# closes the file handle as soon as writing finishes, so the pickle is fully
# flushed to disk before any later step reads it.
with open('best_model_object.pkl', 'wb') as model_file:
    pickle.dump(best_model_object, model_file)

## 9. Model Deployment

- Once satisfied with the model's performance, deploy it to a production environment. This could be through a REST API, a web application, or by integrating it into existing software systems.



## 10. Making Predictions

- Use the deployed model to make predictions on new data. Ensure that the new data is preprocessed and transformed in the same way as the training data.

In [68]:
# Predict labels for the transformed test features with the selected model.
predicted = best_model_object.predict(X_test_transformed)

In [69]:
# Accuracy of the selected model's predictions against the true test labels.
accuracy_score(y_test,predicted)

0.8442743117771624



## 11. Monitor and Update

- Regularly monitor the model's performance as it might degrade over time (concept drift). Update the model periodically by retraining it with new data or tweaking it to maintain accuracy.

## 12. Documentation and Reporting

- Throughout the process, document your findings, methodology, model parameters, and performance metrics. This is crucial for reproducibility and for stakeholders to understand the model's capabilities and limitations.