In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# Sample dataset
data = {
    'Age': [25, 30, 35, 22, 28],
    'Gender': ['Male', 'Female', 'Male', 'Male', 'Female'],
    'Salary': [45000, 60000, 58000, 40000, 55000],
    'Purchased': ['Yes', 'No', 'Yes', 'No', 'Yes']
}

df = pd.DataFrame(data)

# 1. Handling Missing Data (if any)
# For simplicity, we assume there's no missing data in this example.

# 2. Encoding Categorical Variables
# Label encoding for 'Gender'
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# One-hot encoding for 'Purchased'
one_hot_encoder = OneHotEncoder()
purchased_encoded = one_hot_encoder.fit_transform(df[['Purchased']]).toarray()
df_encoded = pd.concat([df, pd.DataFrame(purchased_encoded, columns=[
                       'Purchased_No', 'Purchased_Yes'])], axis=1)
df_encoded = df_encoded.drop(columns=['Purchased'])

# 3. Scaling Numerical Features
scaler = StandardScaler()
df_encoded[['Age', 'Salary']] = scaler.fit_transform(
    df_encoded[['Age', 'Salary']])

# 4. Splitting Data into Training and Testing Sets
X = df_encoded.drop(columns=['Purchased_Yes'])
y = df_encoded['Purchased_Yes']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Now you can use X_train and y_train for training a machine learning model,
# and X_test and y_test for testing and evaluating the model.


Import Libraries:

sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder: Imports specific preprocessing tools from scikit-learn.

Data Loading:

The sample dataset is loaded into a pandas DataFrame named df. 
A DataFrame is a tabular data structure used for data manipulation and analysis.
H
andling Missing Data (if any):

The code includes a comment stating that it assumes there's no missing data. 
In a real-world scenario, you would need to handle missing data, which might involve imputation or removal of rows/columns with missing values.

Encoding Categorical Variables:

The 'Gender' column is encoded using label encoding with LabelEncoder. 
It converts categorical values ('Male' and 'Female') into numerical values (0 and 1), capturing the ordinal relationship between categories.

The 'Purchased' column is one-hot encoded using OneHotEncoder. It converts the categorical variable 'Purchased' into two binary columns ('Purchased_No' and 'Purchased_Yes') to represent whether a purchase was made or not.
Scaling Numerical Features:

The 'Age' and 'Salary' columns are scaled using StandardScaler. This standardization process ensures that both numerical features have similar scales, which is often important for many machine learning algorithms.
Splitting Data into Training and Testing Sets:

The dataset is split into training and testing sets using train_test_split from scikit-learn. 
This is a common practice to evaluate machine learning models. 
In this example, 80% of the data is used for training (X_train and y_train), and 20% is reserved for testing (X_test and y_test). The random_state parameter ensures reproducibility.

Final Comments:
The code concludes with comments explaining that the preprocessed data can now be used for training a machine learning model (X_train and y_train) and for testing and evaluating the model (X_test and y_test).

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create and train a Logistic Regression model
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

print("Classification Model (Logistic Regression) Results:")
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report_output)


Classification Model (Logistic Regression) Results:
Accuracy: 0.00
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       1.0
         1.0       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


^^Logistic Regression
If your goal is to predict whether a customer will make a purchase ("Purchased_Yes" or "Purchased_No"), you can use classification models such as:


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create and train a Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Make predictions on the test data
y_pred = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Regression Model (Linear Regression) Results:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")


Regression Model (Linear Regression) Results:
Mean Squared Error (MSE): 0.00
R-squared (R2): nan




^^Linear Regression
If you want to predict a numerical value (e.g., predicting the amount spent by a customer based on age, gender, and salary), you can use regression models such as: