In [None]:
import pandas as pd
# Colab-only data loading. Each files.upload() call opens a browser file
# picker; the read_csv calls then expect 'train.csv' / 'test.csv' to be
# present in the working directory.
from google.colab import files

# First prompt: the training set.
uploaded = files.upload()
data_train = pd.read_csv('train.csv')

# Second prompt: the test set. `uploaded` is rebound here, so only the result
# of this second upload stays available afterwards.
uploaded = files.upload()
data_test = pd.read_csv('test.csv')


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training CSV from the working directory.
train_data = pd.read_csv('train.csv')

# 'price_range' is the prediction target; every remaining column is a feature.
y_train = train_data['price_range']
X_train = train_data.drop(columns='price_range')

# Histogram grid (50 bins each) to eyeball the distribution of every feature.
X_train.hist(figsize=(20, 15), bins=50)
plt.show()

# Pairwise feature correlations, drawn as a heatmap annotated with the values.
corr_matrix = X_train.corr()
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

# Standardise the numerical columns with StandardScaler.
# NOTE(review): the scaler is fitted on the complete training frame, i.e.
# before the train/validation split performed further down, so validation
# rows contribute to the scaling statistics. Fit on the training split only
# (or wrap the steps in a Pipeline) if strictly leak-free validation and
# cross-validation scores are required.
numerical_features = ['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores',
                      'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time']
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])

# One-hot encode the columns listed in categorical_features, dropping the
# first level of each so the dummy columns are not perfectly collinear.
categorical_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# spelling, so try the current keyword first and fall back for older releases.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoded_values = encoder.fit_transform(X_train[categorical_features])
# Reuse X_train's index so the concat below aligns row-for-row even when the
# frame does not carry a default RangeIndex.
X_train_encoded = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(categorical_features),
    index=X_train.index,
)
X_train = pd.concat([X_train.drop(categorical_features, axis=1), X_train_encoded], axis=1)

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Logistic regression: fit on the training split, predict the validation split.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_split, y_train_split)
y_pred_logistic_regression = logistic_regression_model.predict(X_val_split)

# Random forest. Training is randomised, so the seed is pinned (same value as
# the split) to make the reported metrics reproducible across runs.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_split, y_train_split)
y_pred_random_forest = random_forest_model.predict(X_val_split)

# Neural network (multi-layer perceptron). Weight initialisation and batch
# shuffling are random as well, hence the pinned seed.
neural_network_model = MLPClassifier(random_state=42)
neural_network_model.fit(X_train_split, y_train_split)
y_pred_neural_network = neural_network_model.predict(X_val_split)

# Validation-set metrics for the three models. Precision, recall and F1 use
# average='weighted', i.e. per-class scores weighted by class support.
logistic_regression_accuracy = accuracy_score(y_val_split, y_pred_logistic_regression)
random_forest_accuracy = accuracy_score(y_val_split, y_pred_random_forest)
neural_network_accuracy = accuracy_score(y_val_split, y_pred_neural_network)

logistic_regression_precision = precision_score(y_val_split, y_pred_logistic_regression, average='weighted')
random_forest_precision = precision_score(y_val_split, y_pred_random_forest, average='weighted')
neural_network_precision = precision_score(y_val_split, y_pred_neural_network, average='weighted')

logistic_regression_recall = recall_score(y_val_split, y_pred_logistic_regression, average='weighted')
random_forest_recall = recall_score(y_val_split, y_pred_random_forest, average='weighted')
neural_network_recall = recall_score(y_val_split, y_pred_neural_network, average='weighted')

logistic_regression_f1_score = f1_score(y_val_split, y_pred_logistic_regression, average='weighted')
random_forest_f1_score = f1_score(y_val_split, y_pred_random_forest, average='weighted')
neural_network_f1_score = f1_score(y_val_split, y_pred_neural_network, average='weighted')

# Collect the metrics per model, in the same order as `models`.
models = ['Logistic Regression', 'Random Forest', 'Neural Network']
accuracy = [logistic_regression_accuracy, random_forest_accuracy, neural_network_accuracy]
precision = [logistic_regression_precision, random_forest_precision, neural_network_precision]
recall = [logistic_regression_recall, random_forest_recall, neural_network_recall]
# Deliberately not called `f1_score`: that name is the imported sklearn
# function, and rebinding it to a list would break every later f1_score(...)
# call in the session.
f1_scores = [logistic_regression_f1_score, random_forest_f1_score, neural_network_f1_score]

# Print the evaluation metrics for each model
for model_label, acc, prec, rec, f1 in zip(models, accuracy, precision, recall, f1_scores):
    print(f"{model_label} Metrics:")
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)
    print()

# Horizontal bar chart of the random forest's feature importances. The
# hasattr guard keeps the cell usable if the estimator is swapped for one
# that does not expose `feature_importances_`.
if hasattr(random_forest_model, 'feature_importances_'):
    features = X_train.columns
    importances = random_forest_model.feature_importances_
    # Ascending order, so the most important feature is drawn last (top bar).
    indices = np.argsort(importances)
    bar_positions = range(len(indices))

    plt.figure(figsize=(10, 7))
    plt.barh(bar_positions, importances[indices], align='center', color='b')
    plt.yticks(bar_positions, [features[idx] for idx in indices])
    plt.title('Feature Importances')
    plt.xlabel('Relative Importance')
    plt.show()

# Logistic-regression coefficients as a rough feature-importance view.
# For a multi-class target `coef_` holds one row per class, so `coef_[0]`
# alone would only describe the first class. Average the absolute
# coefficients over all rows instead; with a single row (binary target) this
# reduces to |coef_[0]|.
coefficients = np.abs(logistic_regression_model.coef_).mean(axis=0)
coefficients = pd.Series(coefficients, index=X_train.columns)
coefficients = coefficients.sort_values(ascending=False)
plt.figure(figsize=(10, 7))
plt.title('Feature Importance (Logistic Regression)')
coefficients.plot(kind="bar")
plt.ylabel('Mean |coefficient| across classes')
plt.show()

from sklearn.inspection import permutation_importance

# Permutation importance of the neural network on the validation split:
# accuracy is re-scored with each feature shuffled (5 repeats, fixed seed).
importance = permutation_importance(
    neural_network_model,
    X_val_split,
    y_val_split,
    scoring='accuracy',
    n_repeats=5,
    random_state=42,
)

# Plot the mean importance per feature, sorted ascending so the largest bar
# ends up at the top of the chart.
features = X_train.columns
importances = importance.importances_mean
indices = np.argsort(importances)
bar_positions = range(len(indices))

plt.figure(figsize=(10, 7))
plt.barh(bar_positions, importances[indices], align='center', color='b')
plt.yticks(bar_positions, [features[idx] for idx in indices])
plt.title('Feature Importance (Neural Network)')
plt.xlabel('Mean Importance')
plt.show()



In [None]:


# Further steps with the test set and making predictions...
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Validation-set predictions from each fitted model.
y_pred_logistic_regression_val = logistic_regression_model.predict(X_val_split)
y_pred_random_forest_val = random_forest_model.predict(X_val_split)
y_pred_neural_network_val = neural_network_model.predict(X_val_split)

# One confusion matrix per model (rows: actual class, columns: predicted class).
cm_logistic_regression = confusion_matrix(y_val_split, y_pred_logistic_regression_val)
cm_random_forest = confusion_matrix(y_val_split, y_pred_random_forest_val)
cm_neural_network = confusion_matrix(y_val_split, y_pred_neural_network_val)

# Display names for the classes.
# NOTE(review): assumes price_range has exactly four classes whose sorted
# order corresponds to Low < Medium < High < Very High - confirm against the data.
class_labels = ['Low', 'Medium', 'High', 'Very High']

# Draw the three matrices side by side with identical styling.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
cm_panels = [
    ('Logistic Regression', cm_logistic_regression),
    ('Random Forest', cm_random_forest),
    ('Neural Network', cm_neural_network),
]
for ax, (panel_name, cm) in zip(axes, cm_panels):
    # Cells are integer counts, so annotate with "d"; the previous ".2f"
    # rendered them with two meaningless decimals (e.g. "148.00").
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('Actual label')
    ax.set_title(f'Confusion Matrix for {panel_name} Model')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy for each model on the preprocessed training
# frame.
logistic_regression_cv_scores = cross_val_score(
    logistic_regression_model, X_train, y_train, scoring='accuracy', cv=5)
random_forest_cv_scores = cross_val_score(
    random_forest_model, X_train, y_train, scoring='accuracy', cv=5)
neural_network_cv_scores = cross_val_score(
    neural_network_model, X_train, y_train, scoring='accuracy', cv=5)

# Report the per-fold scores and their mean, one model after the other.
for cv_label, fold_scores in (
    ("Logistic Regression", logistic_regression_cv_scores),
    ("Random Forest", random_forest_cv_scores),
    ("Neural Network", neural_network_cv_scores),
):
    print(f"{cv_label} Cross-Validation Scores:")
    print(fold_scores)
    print("Mean Accuracy:", fold_scores.mean())
    print()


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# The numeric columns of X_train; these were standardised by the scaler in the
# preprocessing cell, so all boxes share a comparable scale.
numeric_frame = X_train[numerical_features]

# Box plot per numeric column: median, spread and outliers at a glance.
plt.figure(figsize=(20, 10))
numeric_frame.boxplot()
plt.title("Box plots for each numeric variable")
plt.show()

# Scatter-plot matrix showing every pair of numeric columns.
sns.pairplot(numeric_frame)
plt.show()

# ROC Curves for Model Evaluation
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def plot_roc_curve(y_true, y_pred, model_name):
    """Plot one-vs-rest ROC curves and their micro-average for one model.

    Both label vectors are binarised with a LabelBinarizer fitted on
    ``y_true``; a ROC curve and AUC are then computed per binarised column,
    plus a micro-average over all columns flattened together.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels (hard predictions, not probabilities).
        Because the inputs are 0/1 indicators after binarisation, every curve
        has a single interior point; pass-through of probability scores is
        not supported by this function.
    model_name : str
        Name inserted into the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    lb = LabelBinarizer()
    lb.fit(y_true)
    y_true = lb.transform(y_true)
    y_pred = lb.transform(y_pred)
    n_columns = y_true.shape[1]

    # Per-class ROC curve and AUC, keyed by binarised column index.
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_columns):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc[i] = roc_auc_score(y_true[:, i], y_pred[:, i])

    # Micro-average: pool every (sample, class) decision by flattening the
    # indicator matrices. This used to be stored and labelled as "macro",
    # which misdescribed what is actually computed.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true.ravel(), y_pred.ravel())
    roc_auc["micro"] = roc_auc_score(y_true, y_pred, average='micro')

    # Plot the micro-average first, then one curve per class.
    plt.figure(figsize=(10,10))
    plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]))
    for i in range(n_columns):
        plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    # Diagonal reference line = chance-level classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic for {}'.format(model_name))
    plt.legend(loc="lower right")
    plt.show()

# One ROC figure per model, each built from that model's validation-set
# label predictions.
for roc_predictions, roc_model_name in (
    (y_pred_logistic_regression, 'Logistic Regression'),
    (y_pred_random_forest, 'Random Forest'),
    (y_pred_neural_network, 'Neural Network'),
):
    plot_roc_curve(y_val_split, roc_predictions, roc_model_name)

    # 1) Box plots for each numeric variable: these help you visualize the distribution, median, and outliers of each numeric variable.

    # 2) Pair plots: these help you visualize the relationship between each pair of numeric variables.

    # 3) ROC curves for model evaluation: you can plot the receiver operating characteristic (ROC) curve of each model in the same plot.
    #    The area under the ROC curve (AUC-ROC) can be used as a metric to compare the performance of the models.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Read the held-out test CSV from the working directory.
data_test = pd.read_csv('test.csv')

# 'id' is a row identifier rather than a feature, so it is removed.
X_test = data_test.drop(columns='id')

# Apply the SAME fitted transformers as for the training data (transform
# only, no re-fitting) so test features live on the training scale/encoding.

# Standardise the numerical columns with the already-fitted scaler.
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# One-hot encode the categorical columns with the already-fitted encoder and
# swap the raw columns for the encoded ones.
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features),
)
X_test = pd.concat([X_test.drop(columns=categorical_features), X_test_encoded], axis=1)

# Predicted classes for the preprocessed test set, one array per model.
logistic_regression_predictions = logistic_regression_model.predict(X_test)
random_forest_predictions = random_forest_model.predict(X_test)
neural_network_predictions = neural_network_model.predict(X_test)

# Print the test-set predictions, model by model.
for pred_heading, pred_values in (
    ("Logistic Regression Predictions:", logistic_regression_predictions),
    ("Random Forest Predictions:", random_forest_predictions),
    ("Neural Network Predictions:", neural_network_predictions),
):
    print(pred_heading)
    print(pred_values)

# Predicted classes for the validation split from the same fitted models.
y_pred_logistic_regression_val = logistic_regression_model.predict(X_val_split)
y_pred_random_forest_val = random_forest_model.predict(X_val_split)
y_pred_neural_network_val = neural_network_model.predict(X_val_split)

# Print the validation-set predictions; a blank line follows each array.
for pred_heading, pred_values in (
    ("Logistic Regression Predictions for Validation Set:", y_pred_logistic_regression_val),
    ("Random Forest Predictions for Validation Set:", y_pred_random_forest_val),
    ("Neural Network Predictions for Validation Set:", y_pred_neural_network_val),
):
    print(pred_heading)
    print(pred_values)

