<a href="https://colab.research.google.com/github/AlexanderPr03/ML/blob/main/testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
# IterativeImputer is still experimental: this enabling import must run before
# it can be imported from sklearn.impute, so keep it above the line below.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the CSV into `df`, the working frame every later cell operates on.
# NOTE(review): 'your_file.csv' looks like a placeholder path - confirm the real location.
df = pd.read_csv(filepath_or_buffer='your_file.csv')

In [None]:
# First look at the data: column dtypes / non-null counts, a few sample rows,
# then summary statistics.
# A notebook cell only renders its LAST expression, so a bare `df.head()` in the
# middle of the cell is computed and thrown away; display() (provided by the
# IPython kernel) renders it explicitly.
df.info()
display(df.head())
df.describe()

In [None]:
# Distribution of the target column, as a histogram and a box plot side by side.
# Without an explicit `ax`, both pandas calls draw on the current matplotlib
# axes, so inside one cell the box plot would be painted over the histogram.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(14, 5))

df['baseRent'].hist(bins=50, ax=ax_hist)
ax_hist.set(title='baseRent histogram', xlabel='baseRent', ylabel='count')

df.boxplot(column=['baseRent'], ax=ax_box)
ax_box.set_title('baseRent box plot')

plt.show()

In [None]:
# Number of fully duplicated rows; the first occurrence of each row is not counted.
df.duplicated(keep='first').sum()

In [None]:
# Grid of histograms, one per column that pandas can histogram.
df.hist(figsize=(20, 15), bins=50)
plt.show()

# Box plot of the target, then rent plotted against living space.
df.boxplot(column=['baseRent'])
df.plot.scatter(x='livingSpace', y='baseRent', alpha=0.5)

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# This cell runs before the label-encoding cell, so any text columns are still
# object dtype here; recent pandas raises when corr() meets non-numeric columns,
# so restrict to numeric columns first (same selector the outlier cell uses).
corr = df.select_dtypes(include=np.number).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Re-check the duplicate-row count (all columns compared, first occurrence kept).
df.duplicated(subset=None, keep='first').sum()

In [None]:
# Integer-encode every object-dtype column of `df` in place.
# Missing values are turned into their own 'Absent' category first.
# One fitted encoder is kept per column so codes can be mapped back later with
# label_encoders[col].inverse_transform(...).
label_encoders = {}

for col in df.columns:
    if df[col].dtype == 'object':
        # fillna is a no-op when nothing is missing, so no null check is needed.
        # Cast to str afterwards: an object column holding e.g. booleans plus the
        # 'Absent' filler is mixed-type, which LabelEncoder refuses to sort.
        filled = df[col].fillna('Absent').astype(str)
        le = LabelEncoder()
        df[col] = le.fit_transform(filled)
        label_encoders[col] = le

NameError: ignored

In [None]:
# Outlier screening on the numeric feature columns (the target is excluded).
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
numerical_cols.remove('baseRent')

# The detectors are fit before the notebook's imputation step, and EllipticEnvelope /
# LocalOutlierFactor reject NaN input. Fit them on a median-filled copy instead;
# `df` itself is not modified, and the fill is a no-op when nothing is missing.
X_outliers = SimpleImputer(strategy='median').fit_transform(df[numerical_cols])

# Isolation Forest (seeded so the count is reproducible)
iso = IsolationForest(contamination=0.1, random_state=42)
outliers_iso = iso.fit_predict(X_outliers)
print("Nr Outliers cu Isolation Forest: ", int((outliers_iso == -1).sum()))

# Elliptic Envelope (seeded: the covariance estimate uses random subsets)
ee = EllipticEnvelope(contamination=0.1, random_state=42)
outliers_ee = ee.fit_predict(X_outliers)
print("Nr Outliers cu Elliptic Envelope: ", int((outliers_ee == -1).sum()))

# Local Outlier Factor
lof = LocalOutlierFactor()
outliers_lof = lof.fit_predict(X_outliers)
print("Nr. Outliers cu Local Outlier Factor: ", int((outliers_lof == -1).sum()))

# Box plot for each numeric feature (drawn from the original, unfilled values)
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Box plot pentru {col}')
    plt.show()

In [None]:
def _impute_frame(frame, imputer):
    """Fit `imputer` on `frame` and return the filled values as a DataFrame.

    The result carries the same column labels and index as `frame`.
    """
    return pd.DataFrame(
        imputer.fit_transform(frame),
        columns=frame.columns,
        index=frame.index,
    )


# Candidate regressors compared in the evaluation cell.
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'Linear Regression': LinearRegression(),
    # Only the `svm` module is imported in this notebook, so a bare `SVR()` is a
    # NameError. The key stays 'SVC' because the evaluation cell matches on that
    # exact string, even though the estimator is the regressor SVR.
    'SVC': svm.SVR(),
}

# Untouched copies of the encoded frame, one per imputation strategy.
df_simple = df.copy()
df_knn = df.copy()
df_mice = df.copy()

# NOTE(review): each imputer sees every column, including 'baseRent', so the
# target is imputed too and also informs the KNN / iterative fills of the
# features - confirm this is intended.
df_simple_imputed = _impute_frame(df_simple, SimpleImputer(strategy='mean'))
df_knn_imputed = _impute_frame(df_knn, KNNImputer(n_neighbors=5))
df_mice_imputed = _impute_frame(df_mice, IterativeImputer(random_state=0))

In [None]:
def _cv_rmse(estimator, frame, target='baseRent'):
    """Return the cross-validated RMSE of `estimator` on `frame`.

    `target` is the column to predict; all other columns are used as features.
    The value is the square root of the mean per-fold MSE.
    """
    scores = cross_val_score(
        estimator,
        frame.drop(target, axis=1),
        frame[target],
        scoring='neg_mean_squared_error',
    )
    # The scorer returns negated MSE, so flip the sign before taking the root.
    return np.sqrt(-scores.mean())


# Compare every model on each of the three imputed frames.
for model_name, model in models.items():
    print(f'\nModel: {model_name}\n')

    # The support-vector model is sensitive to feature scale. The previous code
    # scaled 'baseRent' on df_simple / df_knn / df_mice, frames that are never
    # passed to cross_val_score, so it had no effect (and it scaled the target,
    # not the features). Standardise the features inside a pipeline instead, so
    # the scaler is re-fit on each training fold and RMSE stays in rent units.
    estimator = make_pipeline(StandardScaler(), model) if model_name == 'SVC' else model

    # SimpleImputer
    simple_imputer_score = _cv_rmse(estimator, df_simple_imputed)
    print('Simple Imputer Scor:', simple_imputer_score)

    # KNNImputer
    knn_imputer_score = _cv_rmse(estimator, df_knn_imputed)
    print('KNN Imputer Scor:', knn_imputer_score)

    # IterativeImputer
    mice_imputer_score = _cv_rmse(estimator, df_mice_imputed)
    print('MICE Imputer Scor:', mice_imputer_score)

NameError: ignored

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows as a
# test set; the fixed seed keeps the split repeatable.
# NOTE(review): this splits `df`, not one of the imputed frames built earlier -
# confirm that is intended.
y = df['baseRent']
X = df.drop(columns=['baseRent'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Sweep the number of PCA components and record the cross-validated RMSE of a
# random forest trained on the reduced, standardised training features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Seeded so the sweep is comparable from run to run.
model = RandomForestRegressor(random_state=42)

results = []

for i in range(1, X_scaled.shape[1] + 1):
    pca = PCA(n_components=i)
    pipeline = make_pipeline(pca, model)

    # X_scaled holds the training rows only, so it must be paired with y_train;
    # passing the full `y` gives cross_val_score inconsistent sample counts.
    scores = cross_val_score(pipeline, X_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    # Scores are negated MSE per fold; flip the sign of the mean before the root.
    rmse_score = np.sqrt(-scores.mean())

    results.append({
        'n_componente': i,
        'RMSE': rmse_score,
    })

# Last expression of the cell, so the notebook renders it as a table.
results_df = pd.DataFrame(results)
results_df

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Randomised hyper-parameter search for the random forest on the training split.
rf = RandomForestRegressor(random_state=42)

param_grid = {
    # np.linspace(1, 10, num=1) produced the single value 1, so every candidate
    # was a one-tree forest; search the whole 1..10 range instead.
    'n_estimators': list(range(1, 11)),
    # 'auto' is no longer accepted by current scikit-learn; for a regressor it
    # meant "use all features", which is spelled 1.0.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# 100 sampled parameter combinations, 3-fold CV each, using all CPU cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)

# Best parameter combination found by the search.
best_params = rf_random.best_params_
print(best_params)