In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [34]:

# Load the dataset
csv_file_path = 'laptopData.csv'
data = pd.read_csv(csv_file_path)

# Selecting relevant features and target
features = ['Company', 'TypeName', 'Ram', 'Weight', 'Cpu', 'Gpu', 'ScreenResolution']
target = 'Price'

# Remove rows with missing values in the target variable 'Price'
data = data.dropna(subset=[target])

# Splitting the data into training and testing sets
# (fixed random_state keeps the split reproducible across runs)
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies: the split frames are selections from X, and
# assigning columns on such selections can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fill missing values BEFORE casting to str. Casting first can turn NaN into
# the literal string 'nan', after which fillna() has nothing left to fill.
X_train['Cpu'] = X_train['Cpu'].fillna('Unknown').astype(str)
X_test['Cpu'] = X_test['Cpu'].fillna('Unknown').astype(str)

# Function to extract CPU brand and speed with error handling
def extract_cpu_features(cpu_info):
    """Split a CPU description into a brand label and a clock speed.

    The brand is the first two whitespace-separated tokens joined by a space;
    the speed is the last token with any 'GHz' removed, parsed as a float.

    Parameters
    ----------
    cpu_info : str
        CPU description. Non-string input (e.g. NaN or None) is tolerated.

    Returns
    -------
    pd.Series
        Two elements, ``[brand, speed]``. ``['Unknown', 0]`` is returned when
        the input is not a string or has fewer than three tokens. The speed is
        0 when the last token cannot be parsed as a number.
    """
    # Same guard the GPU / screen extractors use, so a stray NaN or None
    # yields the fallback instead of raising AttributeError on .split().
    if not isinstance(cpu_info, str):
        return pd.Series(['Unknown', 0])

    parts = cpu_info.split()
    if len(parts) < 3:
        return pd.Series(['Unknown', 0])

    brand = ' '.join(parts[:2])
    try:
        speed = float(parts[-1].replace('GHz', ''))
    except ValueError:
        # Last token is not a number; fall back to 0.
        speed = 0
    return pd.Series([brand, speed])

# Function to extract GPU brand with error handling
def extract_gpu_brand(gpu_info):
    """Return the first whitespace-separated token of a GPU description.

    Falls back to 'Unknown' when the input is not a string or contains no
    tokens (empty or whitespace-only).
    """
    tokens = gpu_info.split() if isinstance(gpu_info, str) else []
    return tokens[0] if tokens else 'Unknown'

# Function to extract screen width and height with error handling
def extract_screen_resolution(screen_info):
    """Parse the trailing 'WIDTHxHEIGHT' token of a screen description.

    Parameters
    ----------
    screen_info : str
        Screen description whose last whitespace-separated token is expected
        to look like '1920x1080'. Non-string input is tolerated.

    Returns
    -------
    pd.Series
        Two elements, ``[width, height]`` as ints. ``[0, 0]`` is returned when
        the input is not a string, contains no 'x', or its last token cannot
        be parsed as exactly two integers separated by 'x'.
    """
    if not isinstance(screen_info, str) or 'x' not in screen_info:
        return pd.Series([0, 0])

    resolution = screen_info.split()[-1]
    try:
        # An 'x' elsewhere in the text (e.g. 'Flex Touchscreen') passes the
        # check above, but the last token may still fail to unpack or to
        # convert; both failures raise ValueError.
        width, height = resolution.split('x')
        return pd.Series([int(width), int(height)])
    except ValueError:
        return pd.Series([0, 0])

# Derive CPU brand / speed columns from the raw 'Cpu' text
cpu_features_train = X_train['Cpu'].apply(extract_cpu_features)
cpu_features_train.columns = ['Cpu_Brand', 'Cpu_Speed']

cpu_features_test = X_test['Cpu'].apply(extract_cpu_features)
cpu_features_test.columns = ['Cpu_Brand', 'Cpu_Speed']

X_train = pd.concat([X_train, cpu_features_train], axis=1)
X_test = pd.concat([X_test, cpu_features_test], axis=1)

# Fill missing values BEFORE casting to str. Casting first can turn NaN into
# the literal string 'nan', which fillna() then ignores and which would show
# up as a bogus 'nan' GPU brand.
X_train['Gpu'] = X_train['Gpu'].fillna('Unknown').astype(str)
X_test['Gpu'] = X_test['Gpu'].fillna('Unknown').astype(str)

X_train['Gpu_Brand'] = X_train['Gpu'].apply(extract_gpu_brand)
X_test['Gpu_Brand'] = X_test['Gpu'].apply(extract_gpu_brand)

# Same ordering fix for the screen description; '0x0' parses to width 0, height 0
X_train['ScreenResolution'] = X_train['ScreenResolution'].fillna('0x0').astype(str)
X_test['ScreenResolution'] = X_test['ScreenResolution'].fillna('0x0').astype(str)

screen_resolution_features_train = X_train['ScreenResolution'].apply(extract_screen_resolution)
screen_resolution_features_train.columns = ['Screen_Width', 'Screen_Height']

screen_resolution_features_test = X_test['ScreenResolution'].apply(extract_screen_resolution)
screen_resolution_features_test.columns = ['Screen_Width', 'Screen_Height']

X_train = pd.concat([X_train, screen_resolution_features_train], axis=1)
X_test = pd.concat([X_test, screen_resolution_features_test], axis=1)

# Dropping original columns after feature extraction
X_train = X_train.drop(columns=['Cpu', 'Gpu', 'ScreenResolution'])
X_test = X_test.drop(columns=['Cpu', 'Gpu', 'ScreenResolution'])

# Updated features list
updated_features = ['Company', 'TypeName', 'Ram', 'Weight', 'Cpu_Brand', 'Cpu_Speed', 'Gpu_Brand', 'Screen_Width', 'Screen_Height']

# Label encoding for every non-numeric feature.
# Checking "not numeric" rather than "== object" also catches string/category
# dtypes, which StandardScaler below could not handle either.
# NOTE(review): each encoder is fitted on train + test together so transform()
# never meets an unseen label; this lets test-set categories influence the
# encoding. Confirm that is acceptable for the evaluation.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model reads as magnitude. If 'Ram' / 'Weight' arrive as text,
# they are encoded this way too - verify against the CSV.
label_encoders = {}
for feature in updated_features:
    if not pd.api.types.is_numeric_dtype(X_train[feature]):
        le = LabelEncoder()
        combined_data = pd.concat([X_train[feature], X_test[feature]], axis=0)
        le.fit(combined_data)
        X_train[feature] = le.transform(X_train[feature])
        X_test[feature] = le.transform(X_test[feature])
        label_encoders[feature] = le

# Standardizing the features: statistics come from the training set only and
# are then applied to the test set.
# From here on X_train / X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Applying Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Applying Ridge Regression (default regularization strength)
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)
y_pred_ridge = ridge_reg.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Last expression of the cell: displayed as the cell output
mse_linear, r2_linear, mse_ridge, r2_ridge


(659424294.4355531, 0.5454374233655623, 659522677.9240884, 0.5453696044325563)

In [35]:
# Preview the first ten linear-regression predictions for the test set
y_pred_linear[:10]

array([61248.89263421, 97476.7197198 , 71413.25841409, 41451.02223329,
       26450.87987425, 71814.97373607, 64783.38655864, 50648.13607707,
       69252.86484457, 70431.90605437])

In [37]:
# First ten actual prices from the test split, for comparison with the predictions
y_test.head(10)

44       53226.720
1188     58554.720
133      30476.160
1262     24455.520
1222     28185.120
711      58021.920
1160     79866.720
881      36486.144
546      41824.800
800     110017.872
Name: Price, dtype: float64