In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder

# Load the company data
company_data = pd.read_csv('Company_Data.csv')

def encode_categorical_columns(df):
    encoded_df = df.copy()
    encoder = LabelEncoder()
    
    # Selecting the categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    
    # Encoding only categorical variables
    for column in categorical_columns:
        encoded_df[column] = encoder.fit_transform(encoded_df[column])
    
    return encoded_df

company_data = encode_categorical_columns(company_data)

# Assuming the 'Sales' column is the target and the rest are the features
X = company_data.drop('Sales', axis=1)
y = company_data['Sales']

# Splitting the dataset into the training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Instantiate the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_regressor.predict(X_test)

# Calculate RMSE
rmse = sqrt(mean_squared_error(y_test, y_pred))

# Feature importances
feature_importances = pd.DataFrame(rf_regressor.feature_importances_,
                                   index = X_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

# Output the RMSE and feature importances
print(f'RMSE: {rmse}')
print(feature_importances)


RMSE: 1.5995118699934674
             importance
Price          0.309493
ShelveLoc      0.213935
Age            0.133396
CompPrice      0.102397
Advertising    0.089246
Income         0.058862
Population     0.047268
Education      0.031460
Urban          0.008026
US             0.005915


In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the fraud data
fraud_data = pd.read_csv('Fraud_check.csv')

def encode_categorical_columns(df):
    encoded_df = df.copy()
    encoder = LabelEncoder()
    
    # Selecting the categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    
    # Encoding only categorical variables
    for column in categorical_columns:
        encoded_df[column] = encoder.fit_transform(encoded_df[column])
    
    return encoded_df

fraud_data = encode_categorical_columns(fraud_data)

# Convert 'Taxable.Income' to a binary classification
fraud_data['Risk'] = fraud_data['Taxable.Income'].apply(lambda x: 'Risky' if x <= 30000 else 'Good')

# Define the features and the target
X = fraud_data.drop(['Taxable.Income', 'Risk'], axis=1)
y = fraud_data['Risk']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the model performance
print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.7555555555555555
              precision    recall  f1-score   support

        Good       0.79      0.95      0.86       143
       Risky       0.00      0.00      0.00        37

    accuracy                           0.76       180
   macro avg       0.39      0.48      0.43       180
weighted avg       0.62      0.76      0.68       180

