In [145]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [146]:
# Load your dataset
data = pd.read_csv('phone_data_cleaned.csv')

In [147]:
data['screen_size'].head()

0    6.70
1    6.59
2    6.60
3    6.55
4    6.70
Name: screen_size, dtype: float64

In [148]:
# Select the relevant features and target variable
features = data[['has_5g', 'ram_amount', 'inbuilt_storage', 'battery_life', 'charging_speed', 'primary_camera_resolution', 'clock_speed_ghz']]
target = data['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [149]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = XGBRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [150]:
# Get feature importances
importances = model.feature_importances_
feature_names = features.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values(by='importance', ascending=False)

# Normalize feature importances to use as weights
total_importance = sum(feature_importance_df['importance'])
feature_importance_df['normalized_importance'] = feature_importance_df['importance'] / total_importance

# Convert feature importances to a dictionary of weights
weights = dict(zip(feature_importance_df['feature'], feature_importance_df['normalized_importance']))

print("Feature Importances:")
print(feature_importance_df)

# Assigning weights based on manual assessment
manual_weights = {
    'battery_life': 0.25,
    'primary_camera_resolution': 0.20,
    'ram_amount': 0.15,
    'charging_speed': 0.15,
    'inbuilt_storage': 0.10,
    'clock_speed_ghz': 0.10,
    'has_5g': 0.05
}

print("Manual Weights:")
for feature, weight in manual_weights.items():
    print(f"{feature}: {weight}")


Feature Importances:
                     feature  importance  normalized_importance
1                 ram_amount    0.656531               0.656531
5  primary_camera_resolution    0.139581               0.139581
2            inbuilt_storage    0.054993               0.054993
6            clock_speed_ghz    0.052543               0.052543
0                     has_5g    0.049908               0.049908
4             charging_speed    0.025561               0.025561
3               battery_life    0.020883               0.020883
Manual Weights:
battery_life: 0.25
primary_camera_resolution: 0.2
ram_amount: 0.15
charging_speed: 0.15
inbuilt_storage: 0.1
clock_speed_ghz: 0.1
has_5g: 0.05


In [151]:
# Example user inputs
screen_size_preferences = ['large', 'medium']  # Can be ['small', 'medium', 'large']
price_range = (5000, 50000)  # Example price range in your currency
user_query = "i want a large phone which has a good camera and fast speed"

# Filter based on user preferences for screen size and price range
filtered_data = data[(data['screen_size_category'].isin(screen_size_preferences)) & 
                     (data['price'] >= price_range[0]) & 
                     (data['price'] <= price_range[1])]

print("Filtered Data:")
print(filtered_data[['model', 'price', 'screen_size_category']])


Filtered Data:
                                       model    price screen_size_category
1                  OnePlus Nord CE 2 Lite 5G  19989.0                large
2                      Samsung Galaxy A14 5G  16499.0                large
3                       Motorola Moto G62 5G  14999.0                large
4                         Realme 10 Pro Plus  24999.0                large
5    Samsung Galaxy F23 5G (6GB RAM + 128GB)  16999.0                large
..                                       ...      ...                  ...
840               Motorola Moto Edge S30 Pro  34990.0                large
841                              Honor X8 5G  14990.0                large
842          POCO X4 GT 5G (8GB RAM + 256GB)  28990.0                large
843                     Motorola Moto G91 5G  19990.0                large
844                   Samsung Galaxy M52s 5G  24990.0                large

[751 rows x 3 columns]


In [152]:
# Adjust weights based on the query
def adjust_weights(manual_weights, query, boost_factor=1.5):
    feature_keywords = {
        'camera': 'primary_camera_resolution',
        'fast': 'clock_speed_ghz'
    }
    adjusted_weights = manual_weights.copy()
    for keyword, feature in feature_keywords.items():
        if keyword in query:
            adjusted_weights[feature] *= boost_factor
    total_weight = sum(adjusted_weights.values())
    normalized_adjusted_weights = {k: v / total_weight for k, v in adjusted_weights.items()}
    return normalized_adjusted_weights

# Adjust weights
adjusted_weights = adjust_weights(manual_weights, user_query)
adjusted_weights1 = adjust_weights(weights, user_query)

print("Adjusted Weights:")
print(adjusted_weights)
print(adjusted_weights1)

Adjusted Weights:
{'battery_life': 0.21739130434782605, 'primary_camera_resolution': 0.2608695652173913, 'ram_amount': 0.13043478260869562, 'charging_speed': 0.13043478260869562, 'inbuilt_storage': 0.08695652173913043, 'clock_speed_ghz': 0.13043478260869565, 'has_5g': 0.043478260869565216}
{'ram_amount': 0.5989911823922999, 'primary_camera_resolution': 0.19102172307267698, 'inbuilt_storage': 0.05017307907980884, 'clock_speed_ghz': 0.07190648579495283, 'has_5g': 0.04553429062429272, 'charging_speed': 0.023320804024399722, 'battery_life': 0.01905243501156901}


In [143]:
# Function to calculate the score
def calculate_score(row, manual_weights):
    score = 0
    for feature in manual_weights.keys():
        score += row[feature] * manual_weights[feature]
    return score

# Ensure all necessary features are present in the filtered dataset
print("Filtered Data Columns:")
print(filtered_data.columns)

Filtered Data Columns:
Index(['model', 'price', 'rating', 'company', 'is_dual_sim', 'has_5g',
       'supports_volte', 'supports_wifi', 'supports_nfc', 'ram_amount',
       'inbuilt_storage', 'battery_life', 'charging_speed', 'screen_size',
       'resolution_width', 'resolution_height', 'memory_card_supported',
       'primary_camera_resolution', 'num_rear_cameras', 'clock_speed_ghz',
       'screen_size_category', 'generated_rating'],
      dtype='object')


In [153]:
# Apply the scoring function to each row in the filtered dataset
filtered_data['generated_rating'] = filtered_data.apply(lambda row: calculate_score(row, adjusted_weights), axis=1)
filtered_data['generated_rating1'] = filtered_data.apply(lambda row: calculate_score(row, adjusted_weights1), axis=1)
# Sort the phones by the generated ratings
top_5_phones = filtered_data.sort_values(by='generated_rating', ascending=False).head(5)
top_5_phones1 = filtered_data.sort_values(by='generated_rating1', ascending=False).head(5)
# Display the top 5 phones
print("Top 5 Phones:")
print(top_5_phones[['model', 'price', 'screen_size', 'generated_rating']])
print(top_5_phones1[['model', 'price', 'screen_size', 'generated_rating1']])

Top 5 Phones:
                              model    price  screen_size  generated_rating
732                    Doogee V Max  45999.0         6.58       4839.295652
345                    Oukitel WP19  29990.0         6.78       4609.834783
417              Samsung Galaxy F63  21999.0         6.70       1555.008696
716              Samsung Galaxy M62  23999.0         6.71       1555.008261
251  Tecno Pova 3 (6GB RAM + 128GB)  13799.0         6.90       1551.313043
                  model    price  screen_size  generated_rating1
732        Doogee V Max  45999.0         6.58         460.818196
345        Oukitel WP19  29990.0         6.78         430.904925
800        Nokia X60 5G  39990.0         6.51         163.580277
417  Samsung Galaxy F63  21999.0         6.70         156.596233
716  Samsung Galaxy M62  23999.0         6.71         156.574428


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['generated_rating'] = filtered_data.apply(lambda row: calculate_score(row, adjusted_weights), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['generated_rating1'] = filtered_data.apply(lambda row: calculate_score(row, adjusted_weights1), axis=1)
