In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Location of the preprocessed dataset. The recorded run of this cell failed
# with FileNotFoundError for this absolute path, so point DATA_PATH at wherever
# the CSV actually lives before running the notebook.
DATA_PATH = '/content/preprocessed.csv'

df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/content/preprocessed.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Class balance of the prediction target.
df['habitable'].value_counts()

In [None]:
# Frequency of each category in the one categorical feature.
df['star_type'].value_counts()

### Splitting data into train and test sets

In [None]:
# Separate the target column from the feature columns.
y = df['habitable']
X = df.drop(columns=['habitable'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of both feature frames as the cell output.
X_train.shape, X_test.shape


In [None]:
# One-hot encode the categorical feature and splice the indicator columns back
# into the feature frames.
# NOTE: this cell rebinds X_train / X_test, so it must follow a fresh run of the
# split cell; running it twice in a row fails because 'star_type' has already
# been dropped from both frames.
categorical_column = ['star_type']

# Dense output so the result can be wrapped in a DataFrame directly; categories
# not seen during fit are ignored at transform time instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training split only, then reuse them on test.
X_train_encoded = encoder.fit_transform(X_train[categorical_column])
X_test_encoded = encoder.transform(X_test[categorical_column])

# Column labels generated by the encoder for the indicator columns.
encoded_feature_names = encoder.get_feature_names_out(categorical_column)

# Re-attach each split's own index so the column-wise concat below lines up
# row for row.
X_train_encoded_df = pd.DataFrame(
    data=X_train_encoded,
    index=X_train.index,
    columns=encoded_feature_names,
)
X_test_encoded_df = pd.DataFrame(
    data=X_test_encoded,
    index=X_test.index,
    columns=encoded_feature_names,
)

# Swap the raw categorical column for its encoded columns in both splits.
X_train = pd.concat(
    [X_train.drop(columns=categorical_column), X_train_encoded_df],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=categorical_column), X_test_encoded_df],
    axis=1,
)

In [None]:
# Preview the training features after the encoded star_type columns were appended.
X_train.head()

In [None]:
# (rows, columns) of the training feature frame after encoding.
X_train.shape

## Training Random Forest

In [None]:
# Seed the forest with the same value used for the train/test split so that
# bootstrap sampling and per-split feature selection, and therefore the
# reported accuracy, are reproducible across kernel restarts.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

## Accuracy test

In [None]:
# Random-forest predictions on the held-out split.
# NOTE: `y_pred` is reassigned by the logistic-regression cell further down, so
# it only holds the forest's predictions until that cell runs.
y_pred = model.predict(X_test)

In [None]:
# Test-set accuracy of the random forest, predicted afresh from the model.
accuracy_score(y_true=y_test, y_pred=model.predict(X_test))

1.0

### Training Logistic Regression model

In [None]:
# The unscaled fit stopped at the solver's iteration limit without converging
# (see the warning recorded for the original run). As that warning suggests,
# standardise the features and raise max_iter. Wrapping both steps in a
# pipeline means `model_log.predict` applies the same scaling to any frame it
# is given, so callers keep passing the unscaled X_test.
model_log = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model_log.fit(X_train, y_train)
y_pred = model_log.predict(X_test)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Score the logistic-regression model directly instead of through `y_pred`:
# that name is also assigned by the random-forest cell, so reading it here
# would make the result depend on which cell ran last. This mirrors how the
# random-forest accuracy is computed.
accuracy_score(y_test, model_log.predict(X_test))

0.9618733113179225

## Tuning Model using GridSearchCV on Random Forest

# Task
Train an XGBoost classifier model using `X_train` and `y_train`, evaluate its accuracy on `X_test` and `y_test`, and then save the trained model.

## Import XGBoost

### Subtask:
Import the XGBoost library into the environment.


**Reasoning**:
The subtask requires importing the XGBoost library, specifically `XGBClassifier`. I will use a code block to import it.



In [None]:
from xgboost import XGBClassifier

## Train XGBoost Model

### Subtask:
Initialize and train an XGBoost classifier model using the preprocessed `X_train` and `y_train` data.


**Reasoning**:
I need to initialize an `XGBClassifier` and then train it using the `X_train` and `y_train` datasets as instructed.



In [None]:
# First training attempt. `use_label_encoder` is still passed here; the
# recorded output shows XGBoost reporting it as an unused parameter.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_model.fit(X_train, y_train)
print("XGBoost model trained successfully.")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model trained successfully.


**Reasoning**:
The previous code generated a UserWarning because the `use_label_encoder` parameter is deprecated in the current version of XGBoost. I will remove this parameter to avoid the warning and ensure cleaner code execution.



In [None]:
# Retrain without the unused `use_label_encoder` argument; log-loss remains the
# evaluation metric.
xgb_model = XGBClassifier(
    eval_metric='logloss',
)
xgb_model.fit(X_train, y_train)
print("XGBoost model trained successfully.")

XGBoost model trained successfully.


**Reasoning**:
Now that the XGBoost model is trained, I need to evaluate its performance by making predictions on the test set (`X_test`) and calculating the accuracy score using `y_test`.



In [None]:
# Predict on the held-out split and report plain accuracy for the XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
print(f"XGBoost Model Accuracy: {accuracy_xgb}")

XGBoost Model Accuracy: 0.999699789852897


## Save the Trained XGBoost Model

### Subtask:
Save the trained XGBoost model to a file.

**Reasoning**:
To save the trained XGBoost model, I need to use a serialization library like `joblib`. First, I will import `joblib` to make its functions available.



In [None]:
# joblib is used in the next code cell to serialise the trained model to disk.
import joblib
print("joblib imported successfully.")

joblib imported successfully.


**Reasoning**:
Now that joblib is imported, I will use `joblib.dump()` to save the trained `xgb_model` to a file named 'xgb_model.pkl'.



In [None]:
# Persist the trained classifier.
joblib.dump(xgb_model, 'xgb_model.pkl')

# The classifier was trained on the one-hot encoded `star_type` columns, so
# anyone loading it for inference also needs the fitted encoder to rebuild the
# same feature columns from raw rows. Save it next to the model.
joblib.dump(encoder, 'star_type_encoder.pkl')

# Both files are pickle-based: only load them from a trusted source.
print("XGBoost model saved successfully to xgb_model.pkl")
print("Fitted OneHotEncoder saved successfully to star_type_encoder.pkl")

XGBoost model saved successfully to xgb_model.pkl


## Final Task

### Subtask:
Confirm the completion of the XGBoost model training, evaluation, and saving process.


## Summary:

### Data Analysis Key Findings

*   The `XGBClassifier` was successfully imported and initialized with `eval_metric='logloss'`.
*   The XGBoost model was trained on `X_train` and `y_train`. An initial `UserWarning` regarding the deprecated `use_label_encoder` parameter was encountered and subsequently resolved by removing the parameter from the initialization.
*   The trained XGBoost model achieved a high accuracy of approximately 0.9997 when evaluated on the `X_test` dataset.
*   The `joblib` library was used to successfully save the trained XGBoost model to a file named `xgb_model.pkl`.

### Insights or Next Steps

*   The trained XGBoost model demonstrates excellent predictive performance, suggesting it is highly effective for the given classification task.
*   The saved model (`xgb_model.pkl`) can be readily deployed for future inference, avoiding the need for retraining.
