<a href="https://colab.research.google.com/github/Dollyp1205/Marketing-Research-Methods---Dolly-Poddar--Jio-Institute/blob/Dolly/Logistics_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
dataset_path = '/content/_autos - regression.xlsx'
df = pd.read_excel(dataset_path)

# Handle missing values
df.fillna({'vehicleType': 'unknown', 'model': 'unknown', 'notRepairedDamage': 'nein'}, inplace=True)

# Drop irrelevant columns, including datetime columns that may cause dtype issues
df.drop(['dateCrawled', 'name', 'dateCreated', 'lastSeen'], axis=1, inplace=True)

# Convert categorical variables to dummy variables
df_encoded = pd.get_dummies(df, columns=['seller', 'offerType', 'abtest', 'vehicleType', 'gearbox', 'fuelType', 'brand', 'notRepairedDamage', 'model'], drop_first=True)

# Define independent variables and target variable
X = df_encoded.drop(['price'], axis=1)
y = df_encoded['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Model assumptions:
# 1. Linear relationship between independent variables and the target variable.
# 2. Homoscedasticity: Constant variance of the residuals.
# 3. Independence of residuals.
# 4. Normality of residuals.
# 5. No multicollinearity among independent variables.

# Note: Further statistical tests (e.g., Durbin-Watson, VIF) may be conducted to check assumptions.

Mean Squared Error: 304974542676.9286
R-squared: -0.04824780621662894


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# Load the dataset
dataset_path = '/content/_autos - regression.xlsx'
df = pd.read_excel(dataset_path)

# Handle missing values
df.fillna({'vehicleType': 'unknown', 'model': 'unknown', 'notRepairedDamage': 'nein'}, inplace=True)

# Drop irrelevant columns, including datetime columns that may cause dtype issues
df.drop(['dateCrawled', 'name', 'dateCreated', 'lastSeen'], axis=1, inplace=True)

# Convert categorical variables to dummy variables
df_encoded = pd.get_dummies(df, columns=['seller', 'offerType', 'abtest', 'vehicleType', 'gearbox', 'fuelType', 'brand', 'notRepairedDamage', 'model'], drop_first=True)

# Ensure all columns are numeric
df_encoded = df_encoded.apply(pd.to_numeric, errors='coerce')

# Check for null values after conversion
if df_encoded.isnull().any().any():
    print("Null values found, replacing with 0...")
    df_encoded.fillna(0, inplace=True)

# Define independent variables and target variable
X = df_encoded.drop(['price'], axis=1)
y = df_encoded['price']

# Ensure X and y are numeric
X = X.astype(float)
y = y.astype(float)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression using sklearn
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Results")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# OLS Regression using statsmodels
X_ols = sm.add_constant(X)  # Add a constant term
ols_model = sm.OLS(y, X_ols).fit()

print("\nOLS Regression Results")
print(ols_model.summary())

Linear Regression Results
Mean Squared Error: 304974542676.9286
R-squared: -0.04824780621662894

OLS Regression Results
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.8769
Date:                Sun, 09 Feb 2025   Prob (F-statistic):              0.944
Time:                        09:48:17   Log-Likelihood:            -6.1345e+06
No. Observations:              371527   AIC:                         1.227e+07
Df Residuals:                  371211   BIC:                         1.227e+07
Df Model:                         315                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.97

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
import statsmodels.api as sm

# Load the dataset
dataset_path = '/content/bank marketing - logistic.xlsx'
df = pd.read_excel(dataset_path)

# Identify categorical variables and convert them to dummy variables
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Define independent variables and target variable for logistic regression
X_logistic = df_encoded.drop(['y'], axis=1)
y_logistic = df_encoded['y'].apply(lambda x: 1 if x == 'yes' else 0)  # Convert target variable to binary

# Split the data into training and testing sets for logistic regression
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = train_test_split(X_logistic, y_logistic, test_size=0.2, random_state=42)

# Create and train the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_logistic, y_train_logistic)

# Make predictions for logistic regression
y_pred_logistic = logistic_model.predict(X_test_logistic)

# Evaluate the logistic regression model
logistic_accuracy = accuracy_score(y_test_logistic, y_pred_logistic)
logistic_conf_matrix = confusion_matrix(y_test_logistic, y_pred_logistic)
logistic_class_report = classification_report(y_test_logistic, y_pred_logistic)

print("Logistic Regression Results")
print("Accuracy:", logistic_accuracy)
print("Confusion Matrix:\n", logistic_conf_matrix)
print("Classification Report:\n", logistic_class_report)

# Define independent variables and target variable for OLS regression
X_ols = df_encoded.drop(['y'], axis=1)
y_ols = df_encoded['y'].apply(lambda x: 1 if x == 'yes' else 0)  # Use binary target for consistency

# Add a constant term for OLS regression
X_ols = sm.add_constant(X_ols)

# Fit the OLS model
ols_model = sm.OLS(y_ols, X_ols).fit()

# Print OLS regression results
print("\nOLS Regression Results")
print(ols_model.summary())

Logistic Regression Results
Accuracy: 0.8961325966850828
Confusion Matrix:
 [[787  20]
 [ 74  24]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       807
           1       0.55      0.24      0.34        98

    accuracy                           0.90       905
   macro avg       0.73      0.61      0.64       905
weighted avg       0.87      0.90      0.88       905



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).