In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read raw_ebay.csv (path is relative to the working directory) into `df`
# and preview its first ten rows.
df = pd.read_csv("raw_ebay.csv")
df.head(10)

In [None]:
df.info()  # column names, non-null counts and data types

In [None]:
df.describe()  # descriptive statistics of the numeric columns

In [None]:
df.shape  # (number of rows, number of columns)

In [None]:
df.isnull().sum()  # number of missing values in each column

In [None]:
# Value conversion: force the two spec columns to a numeric dtype.
# errors='coerce' turns every entry that cannot be parsed into NaN instead of
# raising, so those entries show up as missing values afterwards.
for spec_column in ('Screen_Size', 'RAM'):
    df[spec_column] = pd.to_numeric(df[spec_column], errors='coerce')

In [None]:
# Replace the missing values of each column with that column's mean.
# NOTE(review): the means are taken over the whole data set and the Price
# column is imputed as well - confirm this is intended before using Price
# as a model target.
for numeric_column in ('Screen_Size', 'RAM', 'Price'):
    column_mean = df[numeric_column].mean()
    df[numeric_column] = df[numeric_column].fillna(column_mean)

In [None]:
# Re-inspect dtypes and non-null counts after the conversion and fill steps.
df.info()

In [None]:
# Outlier fences for Price using the 1.5 * IQR rule.
# Both quartiles come from a single quantile call and are picked out by label.
price_quartiles = df['Price'].quantile([0.25, 0.75])
Q1 = price_quartiles.loc[0.25]
Q3 = price_quartiles.loc[0.75]
IQR = Q3 - Q1

# Distance from each quartile to its fence.
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

print(f"Lower bound: {lower_bound}, Upper bound: {upper_bound}")

In [None]:
# Box plot of Price: shows the quartiles, the whiskers and the points drawn
# beyond them as outliers.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=df['Price'], ax=ax)
ax.set_title("Price Dispersion Boxplot")
plt.show()

In [None]:
# Compute the lower quartile (Q1) of Price on its own, without the IQR fences:
# 25% of the prices lie at or below this value.
Q1_manual = df['Price'].quantile(0.25)
print(f"Lower quartile (Q1) manuel calculating: {Q1_manual}")

In [None]:
# Drop price outliers: keep only the rows whose Price lies inside
# [lower_bound, upper_bound]. Series.between includes both end points by
# default, matching a >= / <= comparison pair.
price_in_range = df['Price'].between(lower_bound, upper_bound)
df_filtered = df[price_in_range]
print(f"Clear data set shape: {df_filtered.shape}")

In [None]:
# Histogram (30 bins, with a KDE overlay) of the prices that remain after the
# IQR outlier filter.
sns.histplot(df_filtered['Price'], bins=30, kde=True)
# The title names the chart that is actually drawn: this is a histogram,
# not a box plot.
plt.title("Price Distribution Histogram")
plt.xlabel("Price")
plt.ylabel("Frequency ")
plt.show()

In [None]:
# Average price per brand: group the rows by Brand and take the mean of Price.
df_brand_avg = df.groupby("Brand")["Price"].mean().reset_index()

plt.figure(figsize=(20,6))
sns.barplot(x="Brand", y="Price", data=df_brand_avg)
# Brand labels are rotated so they do not overlap on the x axis.
plt.xticks(rotation=90)
# The bars show means (see the aggregation above), so the title says "Mean".
plt.title("Brand-Based Mean Prices")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heat map (two decimals per cell).
df_numeric = df.select_dtypes(include=np.number)
correlation_matrix = df_numeric.corr()

fig, ax = plt.subplots(figsize=(5, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix")
plt.show()



In [None]:
# Descriptive statistics of Price within each Brand, RAM and Screen_Size group.
# A wider display keeps every describe() column on a single printed row.
pd.set_option('display.width', 200)

# (header to print, column to group by), in output order.
summary_sections = (
    ("=== Brand - Price ===", "Brand"),
    ("\n=== RAM - Price ===", "RAM"),
    ("\n=== Screen Size - Price ===", "Screen_Size"),
)
for section_header, group_key in summary_sections:
    print(section_header)
    print(df.groupby(group_key)["Price"].describe())

In [None]:
# Number of rows per brand, most frequent brand first.
df["Brand"].value_counts()

In [None]:
# Frequency table for Processor: absolute counts next to percentage shares.
processor = df["Processor"]
freq_table = processor.value_counts()
ratio_table = processor.value_counts(normalize=True).mul(100)

# Both Series are indexed by processor value, so they line up row by row.
result = pd.DataFrame(
    {
        "Frequences": freq_table,
        "Ratio (%)": ratio_table.round(2),
    }
)
print(result)


In [None]:
# Train / test split and model training

In [None]:
# Prepare Brand for modelling. The guard keeps the cell re-runnable: once the
# encoding has run, the Brand column no longer exists and touching it again
# would raise a KeyError.
if 'Brand' in df.columns:
    # Fill missing brands with the most frequent brand (the mode).
    df['Brand'] = df['Brand'].fillna(df['Brand'].mode()[0])
    # One-hot encode Brand into Brand_* columns; drop_first=True leaves out the
    # first category so it acts as the baseline.
    df = pd.get_dummies(df, columns=['Brand'], drop_first=True)



In [None]:
# List the column names of the current frame.
print(df.columns)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Features (independent variables): every Brand_* dummy column plus the two
# numeric specs.
brand_cols = df.columns[df.columns.str.startswith('Brand_')].tolist()
feature_cols = brand_cols + ['Screen_Size', 'RAM']
X = df[feature_cols]
# Target (dependent variable).
y = df['Price']

# Hold out 20% of the rows for evaluation before any model is fitted;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear baseline (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Evaluate the predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean squared error (MSE): {mse:.2f}")
print(f"R-squared value: {r2:.2f}")


In [None]:
# Random forest with 100 trees, fitted on the same training split as the
# linear model; the fixed seed makes the forest reproducible.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)

print(f"Random Forest - MSE: {mse_rf:.2f}, R²: {r2_rf:.2f}")

In [None]:
# Compare the two model families with learning curves: cross-validated MSE as
# a function of the training-set size.
from sklearn.model_selection import learning_curve

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42)
}

# The curves are computed on the two numeric specs only. They get their own
# names so that this cell does not rebind X / y, which must keep describing the
# feature set that X_train and the already fitted models were built from.
df_model = df[['RAM', 'Screen_Size']]
X_curve = df_model
y_curve = df['Price']

plt.figure(figsize=(12, 6))

# The loop variable is `estimator`, not `model`, so the fitted LinearRegression
# bound to `model` is not replaced by an unfitted estimator.
for name, estimator in models.items():
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_curve, y_curve,
        cv=5,
        scoring='neg_mean_squared_error',  # or 'r2'
        train_sizes=np.linspace(0.1, 1.0, 5),
        n_jobs=-1,
        shuffle=True,
        random_state=42
    )

    # Scores are negated MSE values: flip the sign and average over the folds.
    train_errors = -np.mean(train_scores, axis=1)
    test_errors = -np.mean(test_scores, axis=1)

    plt.plot(train_sizes, test_errors, 'o-', label=f'{name} Test MSE')
    plt.plot(train_sizes, train_errors, 'o--', label=f'{name} Train MSE')

plt.xlabel("Training Set Size")
plt.ylabel("MSE")
plt.title("Model Comparision and Learning Curves")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Importance of each input feature according to the fitted random forest.
importances = rf_model.feature_importances_
feature_names = X.columns  # not used further in this cell; labels come from X_train.columns

# One row per feature, most important first. The labels are taken from
# X_train, the frame rf_model was fitted on.
feat_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 9))
sns.barplot(data=feat_importance_df, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Random Forest - Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Attributes')
fig.tight_layout()
plt.show()

In [None]:
# Model optimisation: randomized hyper-parameter search for the random forest.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the search samples from.
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', 0.5],
)

# 50 sampled configurations, each scored with 5-fold cross-validation on the
# training split using negated MSE.
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best configuration found and the estimator fitted with it.
best_rf_model = random_search.best_estimator_
print("En iyi parametreler: ", random_search.best_params_)

# Evaluate the tuned forest on the held-out rows.
# NOTE(review): y_pred, mse and r2 are rebound here, replacing the values the
# linear-regression cell stored under the same names.
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Optimized Random Forest - MSE: {mse:.2f}, R²: {r2:.2f}")
