<a href="https://colab.research.google.com/github/DeemaEssam/AI/blob/main/Riyadh_RealEstate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Feature selection for the Riyadh real-estate dataset.
#
# Steps: load the CSV, rename the Arabic column headers to English, one-hot
# encode the categorical columns, fit a random forest on a training split,
# rank the features by importance, and save the top-ranked features (plus the
# target) to 'newRealEstate.csv'.

# Load the dataset
data_path = '/content/Riyadh_RealEstate.csv'  # Adjust the path to your dataset
df = pd.read_csv(data_path)

# Mapping of Arabic to English column names
# Adjust the names according to your dataset
column_name_mapping = {
    'نوع العقار': 'Property Type',
    'الغرض': 'Purpose',
    'المدينة': 'City',
    'الحي': 'District',
    'الواجهة': 'Front',
    'المساحة': 'Area',
    'سعر المتر': 'Price per Meter',
    'عدد الغرف': 'Number of Rooms',
    'عدد الصالات': 'Number of Halls',
    'عدد الحمامات': 'Number of Bathrooms',
    'مطبخ': 'Kitchen',
    'غرفة خادمة': 'Maid Room',
    'غرفة سائق': 'Driver Room',
    'ملحق': 'Annex',
    'حوش': 'Yard',
    'مسبح': 'Pool',
    'قبو': 'Basement',
    'مدخل سيارة': 'Car Entrance',
    'مصعد': 'Elevator',
    'السعر الاجمالي': 'Total Price'
}

# Column the model predicts, columns that get one-hot encoded, and how many
# of the highest-ranked features are kept in the exported dataset.
TARGET_COLUMN = 'Total Price'
CATEGORICAL_COLUMNS = ['Property Type', 'Purpose', 'City', 'District', 'Front']
TOP_N_FEATURES = 10

# Renaming the columns in the DataFrame (headers that are not in the mapping
# keep their original name)
df_renamed = df.rename(columns=column_name_mapping)

# Fail early with one clear message if the dataset does not contain the
# columns the rest of the pipeline relies on.
missing_columns = [
    name for name in CATEGORICAL_COLUMNS + [TARGET_COLUMN]
    if name not in df_renamed.columns
]
if missing_columns:
    raise KeyError(f"Columns missing from {data_path}: {missing_columns}")

# One-hot encode the categorical columns; each category value becomes its own
# '<column>_<value>' indicator column.
# NOTE(review): the feature names printed by this cell contain category values
# padded with spaces (e.g. 'City_ <value> '). If the same value also occurs
# without the padding it would end up in a separate indicator column --
# consider stripping whitespace before encoding; confirm against the data.
df_encoded = pd.get_dummies(df_renamed, columns=CATEGORICAL_COLUMNS)

# Every column except the target is used as a feature.
# NOTE(review): 'Price per Meter' together with 'Area' presumably determines
# 'Total Price'; if so it leaks the target and will dominate the importance
# ranking -- confirm whether it should be excluded from X.
X = df_encoded.drop([TARGET_COLUMN], axis=1)
y = df_encoded[TARGET_COLUMN]

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing and fitting the RandomForestRegressor (training split only)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Extracting feature importances
feature_importances = rf.feature_importances_

# Mapping feature importances to the corresponding column names, most
# important first
features = pd.Series(feature_importances, index=X_train.columns).sort_values(ascending=False)

# Identifying the names of the top-ranked features
top_features_names = features.head(TOP_N_FEATURES).index.tolist()

# Filtering the encoded dataset to include only these features plus the
# target variable
filtered_df = df_encoded[top_features_names + [TARGET_COLUMN]]

# Save the filtered dataset to a new CSV file
filtered_df.to_csv('newRealEstate.csv', index=False)

# Display the selected feature names
print(top_features_names)


['Price per Meter', 'Area', 'District_ حي المهدية ', 'Driver Room', 'Purpose_سكني', 'City_ الرياض ', 'Front_أربع شوارع', 'Number of Rooms', 'District_ حي الرمال ', 'Front_غربية']
