# **IMPORTING THE NECESSARY MODULES**

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt

# **DETECTING SKEWNESS AND OUTLIERS IN A DATA SET**

In [None]:
# Thresholds used by the checks in this cell.
SKEW_THRESHOLD = 0.5  # |skewness| above this marks a feature as skewed
Z_THRESHOLD = 3       # |z-score| above this marks a value as an outlier

data = pd.read_csv("copper_data.csv")
numeric_cols = data.select_dtypes(include=np.number).columns

# Detect skewness in the data.
# Missing values are dropped per column first: scipy's skew() returns NaN for a
# column that contains NaN, and NaN never passes the threshold comparison, so
# such a column would otherwise be silently ignored.
skewness = data[numeric_cols].apply(lambda col: skew(col.dropna()))
skewness = skewness[skewness.abs() > SKEW_THRESHOLD]
skewed_features = skewness.index

# Visualize the distribution of skewed features.
# histplot(kde=True, stat="density") is the replacement for the deprecated
# distplot (density-scaled histogram with a KDE curve on top).
for feature in skewed_features:
    fig, ax = plt.subplots()
    sns.histplot(data[feature].dropna(), kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {feature}")
    plt.show()

# Apply log transformation to skewed features.
# log1p is only defined for values > -1; applying it to a column that holds
# negative values would quietly turn them into NaN / -inf, so those columns
# are left untransformed and reported instead.
# NOTE: any skewed numeric column is transformed in place, including
# 'selling_price' if it qualifies -- anything computed from `data` afterwards
# is then on the log1p scale for that column.
log_features = [f for f in skewed_features if data[f].min() >= 0]
skipped_features = [f for f in skewed_features if f not in log_features]
if skipped_features:
    print("Log transform skipped (negative values present):", skipped_features)
if log_features:
    data[log_features] = np.log1p(data[log_features])

# Detect and handle outliers using z-score.
# nan_policy="omit" computes each column's mean/std from its non-missing
# values; with the default ("propagate") a single NaN makes every z-score in
# that column NaN, which hides all of the column's outliers.
z_scores = zscore(data[numeric_cols].to_numpy(dtype=float), nan_policy="omit")
outliers = np.abs(z_scores) > Z_THRESHOLD  # NaN compares False -> not an outlier
data_no_outliers = data[~outliers.any(axis=1)]

# Visualize box plots to compare the distribution before and after handling outliers
for feature in skewed_features:
    fig, ax = plt.subplots(1, 2, figsize=(10, 5))
    sns.boxplot(data=data, y=feature, ax=ax[0])
    sns.boxplot(data=data_no_outliers, y=feature, ax=ax[1])
    ax[0].set_title(f"Box Plot of {feature} (with outliers)")
    ax[1].set_title(f"Box Plot of {feature} (without outliers)")
    plt.show()

# **PREPROCESSING THE DATA SET**

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Separate the features (X) and target variable (y)
X = data.drop('selling_price', axis=1)
y = data['selling_price']

# Handling missing values (if any).
# Only numeric columns have a mean: numeric_only=True keeps text columns from
# raising a TypeError on pandas >= 2.0. fillna() with the resulting Series
# fills just the columns it names, so non-numeric columns are left untouched.
X = X.fillna(X.mean(numeric_only=True))
y = y.fillna(y.mean())

# Encode categorical variables.
# Text columns loaded by read_csv normally have dtype `object`, not the
# dedicated "string" dtype, so every text-like dtype is selected here.
categorical_cols = X.select_dtypes(include=['object', 'string', 'category']).columns

# The encoder's default sparse output is densified with toarray() below, which
# avoids the `sparse` / `sparse_output` keyword whose name differs between
# scikit-learn releases.
encoder = OneHotEncoder(handle_unknown='ignore')
if len(categorical_cols) > 0:
    encoded_values = encoder.fit_transform(X[categorical_cols]).toarray()
    # Keep readable column names and the original row index so the encoded
    # frame can be joined back onto X.
    X_encoded = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index,
    )
else:
    # Nothing to encode: fitting the encoder on zero columns would raise.
    X_encoded = pd.DataFrame(index=X.index)


In [64]:
# Hold out 20% of the rows for testing; only the two listed feature columns
# are passed to the model.

feature_columns = ["width", "thickness"]
X_data = X[feature_columns]

X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y,
    test_size=0.2,
    random_state=42,
)

# **TRAINING THE MODEL**

In [65]:
# Fit a linear regression model on the training split

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained
model = LinearRegression().fit(X_train, y_train)
model


In [82]:
# Make predictions on the test set: run the fitted model on the held-out
# features. y_pred is compared against y_test in the evaluation step.
y_pred = model.predict(X_test)


# **EVALUATING THE MODEL**

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

# Score the test-set predictions: RMSE (square root of the MSE) and R^2
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (('Root Mean Squared Error:', rmse), ('R^2 Score:', r2)):
    print(label, value)