<a href="https://colab.research.google.com/github/Drozeler/Machine-Learning/blob/main/Classified.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Load your dataset into a DataFrame (assuming the dataset is in a CSV file)
df = pd.read_csv('Social_Network_Ads.csv')

# Define your feature matrix X and target vector y
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=76)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=76)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

# Define which features should be scaled
numeric_features = ['Age', 'EstimatedSalary']

# Create a ColumnTransformer to scale numeric features and keep other features unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ])

# Create a pipeline that preprocesses the data and trains a K-Nearest Neighbor (KNN) model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))  # You can adjust the number of neighbors
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy * 100, "%")
print("Precision:", precision * 100, "%")
print("Recall:", recall * 100, "%")
print("F1 Score:", f1 * 100, "%")


Accuracy: 92.23300970873787 %
Precision: 88.88888888888889 %
Recall: 96.0 %
F1 Score: 92.3076923076923 %


In [None]:
# Calculate the percentage of data labeled as 'Purchased'
percentage_data_purchased = (y_test == 1).mean() * 100

# Calculate the percentage of model's predictions labeled as 'Purchased'
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]  # Predict probabilities for class 'Purchased'
threshold = 0.5  # Adjust the threshold if needed
predicted_purchased_indices = y_pred_prob >= threshold
percentage_model_purchased = predicted_purchased_indices.mean() * 100

print("Percentage of data labeled as 'Purchased':", percentage_data_purchased, "%")
print("Percentage of model's predictions labeled as 'Purchased':", percentage_model_purchased, "%")


Percentage of data labeled as 'Purchased': 48.54368932038835 %
Percentage of model's predictions labeled as 'Purchased': 52.42718446601942 %


In [None]:
import numpy as np

# Create a DataFrame with new customer data
new_data = pd.DataFrame({
    'Age': [31],  # Replace with the age of the new customer
    'EstimatedSalary': [190000]  # Replace with the estimated salary of the new customer
})

# Preprocess the new data using the same preprocessor for scaling
new_data_scaled = preprocessor.transform(new_data)

# Make a prediction on the new scaled data using the pipeline
new_prediction = pipeline.named_steps['classifier'].predict(new_data_scaled)

if new_prediction[0] == 1:
    print("Purchased")
else:
    print("Not Purchased")


Purchased


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load your dataset into a DataFrame (assuming the dataset is in a CSV file)
df = pd.read_csv('Social_Network_Ads.csv')

# Define your feature matrix X and target vector y
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']

# Initialize variables to store the best random state and its corresponding accuracy
best_random_state = None
best_accuracy = 0.0

# Iterate over different random states and find the best one
for random_state in range(1, 101):  # You can adjust the range as needed
    # Split data into training and testing sets with the current random state
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Define which features should be scaled
    numeric_features = ['Age', 'EstimatedSalary']

    # Create a ColumnTransformer to scale numeric features and keep other features unchanged
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features)
        ])

    # Create a pipeline that preprocesses the data and trains a Support Vector Machine (SVM) model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', SVC(C=1, kernel='linear'))  # Experiment with different values of C and kernel
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)

    # Calculate accuracy for the current random state
    accuracy = accuracy_score(y_test, y_pred)

    # Check if the current random state gives a higher accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_random_state = random_state

# Print the best random state and its corresponding accuracy
print(f"Best Random State: {best_random_state}")
print(f"Best Accuracy: {best_accuracy * 100:.2f}%")


Best Random State: 61
Best Accuracy: 92.50%


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load your dataset into a DataFrame (assuming the dataset is in a CSV file)
df = pd.read_csv('Social_Network_Ads.csv')

# Define your feature matrix X and target vector y
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define which features should be scaled
numeric_features = ['Age', 'EstimatedSalary']

# Create a ColumnTransformer to scale numeric features and keep other features unchanged
preprocessor = StandardScaler()

# List of classifiers to test
classifiers = [
    ('Logistic Regression', LogisticRegression(C=1)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('Support Vector Machine', SVC(C=1, kernel='linear', random_state=42)),
    ('K-Nearest Neighbor', KNeighborsClassifier()),
    ('Neural Network', MLPClassifier())
    # Add more classifiers here...
]

results = []

for clf_name, classifier in classifiers:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', classifier)
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)

    # Calculate and print evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append((clf_name, accuracy, precision, recall, f1))

# Display the results
for result in results:
    clf_name, accuracy, precision, recall, f1 = result
    print(f'Classifier: {clf_name}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print('='*50)


Classifier: Logistic Regression
Accuracy: 0.8625
Precision: 0.9048
Recall: 0.6786
F1 Score: 0.7755
Classifier: Random Forest
Accuracy: 0.8875
Precision: 0.8065
Recall: 0.8929
F1 Score: 0.8475
Classifier: Gradient Boosting
Accuracy: 0.8625
Precision: 0.8148
Recall: 0.7857
F1 Score: 0.8000
Classifier: Support Vector Machine
Accuracy: 0.8625
Precision: 0.9048
Recall: 0.6786
F1 Score: 0.7755
Classifier: K-Nearest Neighbor
Accuracy: 0.9125
Precision: 0.8621
Recall: 0.8929
F1 Score: 0.8772
Classifier: Neural Network
Accuracy: 0.9500
Precision: 0.9000
Recall: 0.9643
F1 Score: 0.9310




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load your dataset into a DataFrame (assuming the dataset is in a CSV file)
df = pd.read_csv('Social_Network_Ads.csv')

# Define your feature matrix X and target vector y
X = df[['Age', 'EstimatedSalary']]
y = df['Purchased']

# Initialize variables to store the best classifier, scaler, and random state
best_classifier = None
best_scaler = None
best_random_state = None
best_accuracy = 0.0

# Define the classifiers, scalers, and random states to test
classifiers = [
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('K-Nearest Neighbor', KNeighborsClassifier()),
    ('Neural Network', MLPClassifier()),
    ('Support Vector Machine', SVC())
]

scalers = [
    ('StandardScaler', StandardScaler()),
    ('MinMaxScaler', MinMaxScaler()),
    ('RobustScaler', RobustScaler())
]

# Iterate through different combinations of classifiers, scalers, and random states
for clf_name, classifier in classifiers:
    for scaler_name, scaler in scalers:
        for random_state in range(1, 101):  # Adjust the range of random states as needed
            # Split data into training and testing sets with the current random state
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

            # Create a pipeline with the current scaler and classifier
            pipeline = Pipeline([
                ('scaler', scaler),
                ('classifier', classifier)
            ])

            # Fit the pipeline on the training data
            pipeline.fit(X_train, y_train)

            # Make predictions on the test data
            y_pred = pipeline.predict(X_test)

            # Calculate accuracy for the current combination
            accuracy = accuracy_score(y_test, y_pred)

            # Check if the current combination has a higher accuracy
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_classifier = clf_name
                best_scaler = scaler_name
                best_random_state = random_state

# Print the best combination of classifier, scaler, and random state
print(f"Best Classifier: {best_classifier}")
print(f"Best Scaler: {best_scaler}")
print(f"Best Random State: {best_random_state}")
print(f"Best Accuracy: {best_accuracy * 100:.2f}%")




Best Classifier: K-Nearest Neighbor
Best Scaler: StandardScaler
Best Random State: 76
Best Accuracy: 97.50%


In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'n_neighbors': range(1, 31),  # Example range for n_neighbors
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Create KNN classifier
knn = KNeighborsClassifier()

# Create grid search
grid_search = GridSearchCV(
    knn, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1
)

# Fit the grid search on your data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and their accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Hyperparameters: {'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
Best Accuracy: 0.821875
