# Dendrite.ai

### SUBMITTED BY: FAMIT DONGARWAR

#### Assignment for the Data Science Internship opening at Dendrite.ai, an AI/ML startup!

### Without parsing

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import warnings

# Silence every warning category for the rest of the session.
# Note: this also hides library deprecation / convergence notices.
warnings.filterwarnings('ignore')

# Read the Iris measurements; the relative path is resolved against the
# current working directory.
iris_data = pd.read_csv('iris.csv')

# ---- Quick exploration of the raw table ----

# First rows, to see the column layout.
first_rows = iris_data.head()
print("Here are the first five rows of the dataset:\n", first_rows)

# Column dtypes and non-null counts (info() prints directly to stdout).
print("\nDataset Information:\n")
iris_data.info()

# Descriptive statistics of the numeric columns.
numeric_summary = iris_data.describe()
print("\nStatistical Summary:\n", numeric_summary)

# Missing-value count per column (isna is the same check as isnull).
missing_per_column = iris_data.isna().sum()
print("\nCount of Missing Values in Each Column:\n", missing_per_column)

# Number of samples per class label.
species_counts = iris_data['species'].value_counts()
print("\nDistribution of Species in the Dataset:\n", species_counts)

# Prepare the data for modeling
# Convert the categorical target variable (species) into integer class ids.
# The fitted encoder is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder()
iris_data['species'] = label_encoder.fit_transform(iris_data['species'])

# Separate the features "X" from the target variable "y"
features = iris_data.drop('species', axis=1)  # All columns except 'species'
target = iris_data['species']  # The encoded 'species' column

# Split BEFORE scaling: 80% train / 20% test, stratified on the target so both
# parts keep the same class proportions. random_state fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# Standardize to mean 0 / standard deviation 1 using statistics learned from
# the TRAINING rows only. Fitting the scaler on the full dataset (as was done
# previously) leaks test-set statistics into training and makes the reported
# test accuracy optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix on the same (train-fitted) scale; kept so any code that
# still refers to `features_scaled` continues to work.
features_scaled = scaler.transform(features)

# Train various classification models
# Several algorithms are evaluated on the same split to see which performs best.
# The tree-based estimators are randomized, so they get a fixed random_state;
# without it the reported accuracies change from run to run.
classification_models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC()
}

# Maps model name -> test-set accuracy
model_performance = {}

# Train each model and evaluate its performance on the held-out test set
for model_name, model_instance in classification_models.items():
    model_instance.fit(X_train, y_train)  # Train the model
    predictions = model_instance.predict(X_test)  # Make predictions on the test set
    accuracy = accuracy_score(y_test, predictions)  # Fraction of correct predictions
    model_performance[model_name] = accuracy  # Store the accuracy
    print(f"\nPerformance of {model_name}:")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, predictions))

# Fine-tune the Random Forest
# NOTE: Random Forest is tuned unconditionally; it is not selected from the
# accuracies measured above.
rf_param_grid = {
    'n_estimators': [10, 50, 100],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
}

# Exhaustive grid search with 5-fold cross-validation on the training data.
# The forest is seeded so the selected parameters and the resulting accuracy
# are reproducible across runs.
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), rf_param_grid, cv=5, scoring='accuracy'
)
rf_grid_search.fit(X_train, y_train)

# Display the best parameters found during tuning
print("\nOptimal Parameters for Random Forest:", rf_grid_search.best_params_)
best_rf_model = rf_grid_search.best_estimator_  # Estimator refit with the best parameters
rf_predictions = best_rf_model.predict(X_test)  # Make predictions with the tuned model
print("\nPerformance of Tuned Random Forest:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

# Compare the performance of all models
# Register the tuned Random Forest first so that it appears in the comparison
# table (previously it was added only after the table had been printed).
model_performance['Tuned Random Forest'] = accuracy_score(y_test, rf_predictions)

print("\nModel Performance Comparison:")
for model_name, accuracy in model_performance.items():
    print(f"{model_name}: {accuracy:.4f}")

# Identify the model with the highest accuracy.
# On ties, max() returns the first such key in insertion order.
best_model_name = max(model_performance, key=model_performance.get)
best_model_accuracy = model_performance[best_model_name]

# Display the best model and its accuracy
print(f"\nThe Best Performing Model is: {best_model_name} with an Accuracy of: {best_model_accuracy:.4f}")

Here are the first five rows of the dataset:
    sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

Statistical Summary:
        sepal_length  sepal_width  pet

### With parsing and classification

In [8]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load a dataset from either a JSON or a CSV file.
def load_data(file_path):
    """Read a tabular dataset into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the file. A path whose name ends in ``.json`` (compared
        case-insensitively) is parsed with ``json.load`` and handed to the
        ``DataFrame`` constructor; every other path is read with
        ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    # str() lets pathlib.Path objects through; lower() accepts ".JSON" etc.
    if str(file_path).lower().endswith('.json'):
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        return pd.DataFrame(data)
    return pd.read_csv(file_path)

# Write true and predicted labels side by side to a JSON file.
def save_predictions(file_path, true_labels, predicted_labels):
    """Save true and predicted labels to ``file_path`` as indented JSON.

    Parameters
    ----------
    file_path : str or os.PathLike
        Destination file; it is created or overwritten.
    true_labels, predicted_labels : array-like
        Label sequences. Objects that provide ``tolist()`` (NumPy arrays,
        pandas Series) are converted with it; any other iterable (list,
        tuple, ...) is converted with ``list()``.

    The file contains one object with the keys ``"true_labels"`` and
    ``"predicted_labels"``.
    """
    def _as_list(values):
        # Prefer tolist(): it turns NumPy scalars into JSON-serializable
        # Python objects. Fall back to list() for plain sequences.
        to_list = getattr(values, "tolist", None)
        return to_list() if callable(to_list) else list(values)

    results = {
        "true_labels": _as_list(true_labels),
        "predicted_labels": _as_list(predicted_labels)
    }
    with open(file_path, 'w') as json_file:
        json.dump(results, json_file, indent=4)

# Turn a raw DataFrame into model-ready arrays.
def preprocess_data(df):
    """Split ``df`` into scaled features and an integer-encoded target.

    The split is positional: the last column is taken as the target and all
    preceding columns as features.

    Returns
    -------
    tuple
        ``(features_scaled, target_encoded, label_encoder)`` where
        ``label_encoder`` is the fitted ``LabelEncoder`` used for the target.

    Note
    ----
    The scaler is fit on every row of ``df``; if the result is split into
    train/test afterwards, the test rows have contributed to the scaling
    statistics.
    """
    feature_frame, label_column = df.iloc[:, :-1], df.iloc[:, -1]

    # Map the class labels to integer ids.
    label_encoder = LabelEncoder()
    target_encoded = label_encoder.fit_transform(label_column)

    # Put all features on a comparable scale.
    features_scaled = StandardScaler().fit_transform(feature_frame)

    return features_scaled, target_encoded, label_encoder

# Fit several classifiers on the same split and report how each one does.
def evaluate_classifiers(X_train, X_test, y_train, y_test):
    """Train a fixed set of classifiers and score them on the test split.

    For each model the accuracy and the text classification report are
    printed.

    Returns
    -------
    dict
        ``{model name: {"accuracy": float, "report": dict}}`` where
        ``"report"`` is the classification report in dictionary form.
    """
    candidates = [
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("Decision Tree", DecisionTreeClassifier(random_state=42)),
        ("Naive Bayes", GaussianNB()),
    ]

    results = {}
    for model_name, estimator in candidates:
        # fit() returns the estimator itself, so the calls can be chained.
        y_pred = estimator.fit(X_train, y_train).predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report_as_dict = classification_report(y_test, y_pred, output_dict=True)
        results[model_name] = {"accuracy": accuracy, "report": report_as_dict}

        print(f"Results for {model_name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(classification_report(y_test, y_pred))

    return results

# End-to-end classification run: load, preprocess, split, evaluate, save.
def main():
    """Run the classification pipeline on ``iris.csv``.

    Prints progress messages and per-model results, then writes the true and
    predicted labels of a Random Forest to ``prediction1.json``.
    """
    print("Loading the dataset...!")
    data = load_data("iris.csv")

    print("Preprocessing the data... Getting it ready for analysis...!")
    X, y, label_encoder = preprocess_data(data)

    print("Splitting the data into training and testing sets...!")
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    print("Training and evaluating classifiers...!")
    # Called for its printed report; the returned scores are not used here.
    evaluate_classifiers(X_train, X_test, y_train, y_test)

    # NOTE: Random Forest is assumed to be the best model; it is retrained
    # here rather than chosen from the evaluation above.
    print("Training the best model (Random Forest) and saving predictions...")
    best_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    predicted_ids = best_model.predict(X_test)

    # Convert integer ids back to the original label values before saving.
    true_names = label_encoder.inverse_transform(y_test)
    predicted_names = label_encoder.inverse_transform(predicted_ids)
    save_predictions("prediction1.json", true_names, predicted_names)
    print("Predictions saved to prediction1.json.")

# Entry point: run the classification pipeline when this code is executed as
# the top-level program.
if __name__ == "__main__":
    main()

Loading the dataset...!
Preprocessing the data... Getting it ready for analysis...!
Splitting the data into training and testing sets...!
Training and evaluating classifiers...!
Results for Random Forest:
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Results for Decision Tree:
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00    

### For regression

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
import json

# Load a CSV or JSON file into a pandas DataFrame.
def load_data(file_path):
    """Read a tabular dataset into a pandas DataFrame.

    This definition replaces the earlier ``load_data`` in this notebook once
    the cell runs, so it accepts the same inputs: a path ending in ``.json``
    (compared case-insensitively) is parsed with ``json.load`` and passed to
    the ``DataFrame`` constructor; any other path is read with
    ``pandas.read_csv``.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the file.

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    # str() lets pathlib.Path objects through; lower() accepts ".JSON" etc.
    if str(file_path).lower().endswith('.json'):
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            return pd.DataFrame(json.load(json_file))
    return pd.read_csv(file_path)

# Build the model inputs: one-hot encode features, optionally encode the target, scale.
def prepare_data(df, target_col, task_type):
    """Separate ``target_col`` from ``df`` and make the features numeric and scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing the features and the target column.
    target_col : str
        Name of the column to predict.
    task_type : str
        When equal to ``"classification"`` the target is label-encoded;
        for any other value the target is returned unchanged.

    Returns
    -------
    tuple
        ``(features, target, encoder)``: the standardized feature array, the
        (possibly encoded) target, and the fitted ``LabelEncoder`` or ``None``
        when no encoding was applied.
    """
    raw_features = df.drop(columns=[target_col])
    target = df[target_col]

    # Dummy-encode the features; drop_first removes the first level of each
    # encoded column.
    dummy_features = pd.get_dummies(raw_features, drop_first=True)

    if task_type == "classification":
        encoder = LabelEncoder()
        target = encoder.fit_transform(target)
    else:
        encoder = None

    # Standardize every feature column to zero mean and unit variance.
    features = StandardScaler().fit_transform(dummy_features)

    return features, target, encoder

# Train the candidate models for the task and return the best-scoring one.
def fit_and_evaluate_models(X_train, X_test, y_train, y_test, task_type):
    """Fit each candidate model, print its metrics, and return the best one.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split of features and target.
    task_type : str
        ``"regression"`` selects the regressors, scored by R^2. Any other
        value selects the classifiers, scored by accuracy.

    Returns
    -------
    estimator
        The fitted model with the highest test score. On ties the model
        evaluated first is kept.
    """
    is_regression = task_type == "regression"

    # Define the models
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest Regressor": RandomForestRegressor(random_state=42),
    } if is_regression else {
        "Logistic Regression": LogisticRegression(max_iter=200),
        "Random Forest Classifier": RandomForestClassifier(random_state=42),
    }

    best_model = None
    # Start below every possible score for BOTH task types. Starting the
    # classification case at 0 with a strict ">" comparison meant that if every
    # classifier scored 0.0, no model was selected and None was returned.
    best_score = -np.inf

    # Loop through each model, train and evaluate its performance
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        if is_regression:
            mse = mean_squared_error(y_test, predictions)
            score = r2_score(y_test, predictions)
            print(f"{model_name} - Mean Squared Error: {mse:.4f}, R^2 Score: {score:.4f}")
        else:  # classification
            score = accuracy_score(y_test, predictions)
            print(f"{model_name} - Accuracy: {score:.4f}")
            print(classification_report(y_test, predictions))

        # An undefined (NaN) score compares False against everything; rank it
        # lowest so a fitted model is still returned.
        comparable = -np.inf if np.isnan(score) else score
        if best_model is None or comparable > best_score:
            best_score = comparable
            best_model = model

    return best_model

# Dump actual vs. predicted values (and decoded labels, if available) to JSON.
def export_predictions_to_json(filename, actual, predicted, encoder=None):
    """Write actual and predicted values to ``filename`` as JSON.

    Parameters
    ----------
    filename : str
        Output path; the file is created or overwritten.
    actual, predicted
        Objects exposing ``tolist()`` (e.g. NumPy arrays, pandas Series).
    encoder : optional
        When truthy, its ``inverse_transform`` is applied to both inputs and
        the results are stored under ``"actual_labels"`` and
        ``"predicted_labels"``.
    """
    results = dict(
        actual_values=actual.tolist(),
        predicted_values=predicted.tolist(),
    )

    # With an encoder, also record the decoded labels.
    if encoder:
        for key, encoded in (("actual_labels", actual), ("predicted_labels", predicted)):
            results[key] = encoder.inverse_transform(encoded).tolist()

    with open(filename, "w") as json_file:
        json.dump(results, json_file)

# End-to-end run: load, prepare, split, pick the best model, export predictions.
def run_pipeline():
    """Run the modeling pipeline on ``iris.csv`` as a regression on ``petal_width``.

    The best model returned by ``fit_and_evaluate_models`` is used to predict
    the test split, and the actual/predicted values are written to
    ``prediction2.json``.
    """
    # Dataset path, column to predict, and task type.
    data_file, target_variable, task = "iris.csv", "petal_width", "regression"

    dataset = load_data(data_file)

    # label_encoder is None unless the task is classification.
    features, target, label_encoder = prepare_data(
        df=dataset, target_col=target_variable, task_type=task
    )

    # 80/20 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    best_model = fit_and_evaluate_models(X_train, X_test, y_train, y_test, task)

    # Predict with the selected model and persist the results.
    export_predictions_to_json(
        "prediction2.json", y_test, best_model.predict(X_test), label_encoder
    )

# Entry point: run the modeling pipeline when this code is executed as the
# top-level program.
if __name__ == "__main__":
    run_pipeline()

Linear Regression - Mean Squared Error: 0.0293, R^2 Score: 0.9538
Random Forest Regressor - Mean Squared Error: 0.0363, R^2 Score: 0.9429
