## Linear Regression Project

In [None]:

### Data Loading from SQL Database

import sqlite3
from contextlib import closing

import pandas as pd

# Read the transformed customer table from the local SQLite database into a
# DataFrame.
# sqlite3's own connection context manager only commits or rolls back a
# transaction; it does not close the connection. closing() guarantees the
# database handle is released once the query result is in memory, even if the
# query raises. `conn` stays bound afterwards but refers to a closed connection.
with closing(sqlite3.connect('shopify_data.db')) as conn:
    df = pd.read_sql_query('SELECT * FROM transformed_shopify_customers', conn)


### Data Exploration

In [None]:
import pandas as pd
# Load the raw customer export from CSV.
# NOTE(review): this rebinds `df`, so any frame loaded into `df` by an earlier
# cell is discarded from here on — confirm which source is intended.
file_path = '/mnt/data/customers_exportJul19.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows.
df.head(n=5)

### Data Cleaning (normally handled by data_transformation.py; repeated here so the notebook is self-contained for data-science exploration)

In [None]:
# Count nulls per column, then show only the columns that actually have any.
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [None]:
# Remove the columns that are not used for modelling.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that were already dropped.
df = df.drop(
    columns=['First Name', 'Last Name', 'Email', 'Company', 'Address2', 'Tags', 'Note', 'Phone', 'Customer ID'],
    errors='ignore',
)

In [None]:
from sklearn.impute import SimpleImputer
# Fill missing address fields with each column's most frequent value.
imputed_columns = ['Address1', 'City', 'Province', 'Country', 'Zip']
imputer = SimpleImputer(strategy='most_frequent')
df[imputed_columns] = imputer.fit_transform(df[imputed_columns])

In [None]:
# Re-check the per-column null counts after imputation.
df.isna().sum()

In [None]:
# Drop the province/country code columns.
# Rebinding `df` with errors='ignore' (rather than inplace=True) keeps the cell
# re-runnable: executing it twice no longer raises KeyError.
df = df.drop(columns=['Province Code', 'Country Code'], errors='ignore')
# Show the remaining per-column null counts.
df.isnull().sum()

### Feature Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical column with integer codes.
categorical_columns = ['Accepts Email Marketing', 'Accepts SMS Marketing', 'Tax Exempt', 'Address1', 'City', 'Province', 'Country', 'Zip']
# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, making it impossible to invert the
# codes or to encode new data consistently.
label_encoders = {}
for column in categorical_columns:
    # After the loop `label_encoder` refers to the last column's encoder,
    # exactly as it did when a single instance was reused.
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
df.head()

### Data Hashing

In [None]:
import hashlib
def hash_column(column):
    """Hash every element of a column with SHA-256.

    Each value is converted with ``str()``, UTF-8 encoded, and replaced by the
    hexadecimal digest of its SHA-256 hash.

    Parameters:
        column: an object exposing ``.apply`` element-wise (a pandas Series
            when used via ``DataFrame.apply``).

    Returns:
        The result of ``column.apply`` — one hex digest string per element.
    """
    def _sha256_hex(value):
        text = str(value)
        return hashlib.sha256(text.encode('utf-8')).hexdigest()

    return column.apply(_sha256_hex)
# Hash every column of the frame, column by column, and preview the result.
hashed_df = df.apply(hash_column, axis=0)
hashed_df.head(n=5)

In [None]:
# Persist the hashed frame as CSV (without the index column), relative to the
# notebook's working directory.
hashed_csv_path = 'hashed_customers_data.csv'
hashed_df.to_csv(path_or_buf=hashed_csv_path, index=False)

### Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Target is 'Total Spent'; every other remaining column is used as a feature.
y = df['Total Spent']
X = df.drop(columns=['Total Spent'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Root mean squared error on the test split (square root of the MSE).
rmse = pow(mean_squared_error(y_test, y_pred), 0.5)
rmse

### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Residuals of the fitted model on the held-out test split.
residuals = y_test - y_pred

# Plot the model's own residuals against its fitted values.
# sns.residplot is deliberately not used here: it regresses y on x itself and
# plots the residuals of *that* regression, so handing it precomputed residuals
# would display residuals-of-residuals rather than the model's residuals.
# regplot with lowess=True draws the raw scatter plus a lowess trend line.
ax = sns.regplot(x=y_pred, y=residuals, lowess=True, color='g', line_kws={'color': 'red', 'lw': 1, 'alpha': 1})
# Zero reference line: an unbiased fit scatters evenly around it.
ax.axhline(0, ls=':', c='.2')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residuals vs Fitted Values')
plt.show()

In [None]:
# Pair each feature name with its fitted coefficient, largest coefficient first.
# NOTE(review): no feature scaling is visible in this notebook, so coefficient
# magnitudes are on each feature's own scale — confirm before reading them as
# relative importance.
feature_importance = pd.DataFrame({'Features': X.columns, 'Coefficients': model.coef_})
feature_importance = feature_importance.sort_values(by='Coefficients', ascending=False)

# Horizontal bar chart of the coefficients, one bar per feature.
sns.barplot(data=feature_importance, x='Coefficients', y='Features')
plt.title('Feature Importance')
plt.show()