In [1]:
!pip install keras==2.12.0




In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
import warnings
warnings.filterwarnings('ignore')

class ANNClassifier:
    def __init__(self):
        self.pipeline = None
        self.label_encoder_category = None
        self.label_encoder_main_promotion = None
        self.label_encoder_color = None
        self.scaler = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

    def load_data(self, file_path):
        self.data = pd.read_csv(file_path)

    def preprocess_data(self):
        x = self.data.drop(['success_indicator', 'item_no'], axis=1)
        y = self.data['success_indicator']

        # Encoding categorical variables
        self.label_encoder_category = LabelEncoder()
        x['category_encoded'] = self.label_encoder_category.fit_transform(x['category'])
        x.drop('category', axis=1, inplace=True)

        self.label_encoder_main_promotion = LabelEncoder()
        x['main_promotion_encoded'] = self.label_encoder_main_promotion.fit_transform(x['main_promotion'])
        x.drop('main_promotion', axis=1, inplace=True)

        self.label_encoder_color = LabelEncoder()
        x['color_encoded'] = self.label_encoder_color.fit_transform(x['color'])
        x.drop('color', axis=1, inplace=True)

        # Binning stars ratings
        x['stars'] = np.where(x['stars'] <= 3, 0, 1)

        # Encoding target variable ('FLOP' as 0, 'TOP' as 1)
        label_encoder_target = LabelEncoder()
        y_encoded = label_encoder_target.fit_transform(y)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('flop'), 0, y_encoded)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('top'), 1, y_encoded)

        # Splitting the data into train and test sets
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=77)

        # Scaling features
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(self.x_train)

    def create_model(self):
        model = Sequential()
        model.add(Dense(10, input_dim=4, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    def fit_model(self):
        keras_model = KerasClassifier(build_fn=self.create_model, epochs=10, batch_size=32, verbose=0)
        self.pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', keras_model)
        ])
        self.pipeline.fit(self.x_train, self.y_train)

    def predict(self):
        return self.pipeline.predict(self.x_test)

    def evaluate_model(self):
        y_pred = self.predict()
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))
        print("Accuracy: {:.2f}".format(accuracy))
        print("Precision: {:.2f}".format(precision))
        print("Recall: {:.2f}".format(recall))
        print("F1 Score: {:.2f}".format(f1))

    def load_test_data(self, file_path):
        self.test_data = pd.read_csv(file_path)

    def preprocess_test_data(self):
        test_data_processed = self.test_data.drop(['item_no'], axis=1)
        test_data_processed['category_encoded'] = self.label_encoder_category.transform(test_data_processed['category'])
        test_data_processed.drop('category', axis=1, inplace=True)
        test_data_processed['main_promotion_encoded'] = self.label_encoder_main_promotion.transform(test_data_processed['main_promotion'])
        test_data_processed.drop('main_promotion', axis=1, inplace=True)
        test_data_processed['color_encoded'] = self.label_encoder_color.transform(test_data_processed['color'])
        test_data_processed.drop('color', axis=1, inplace=True)
        test_data_processed['stars'] = np.where(test_data_processed['stars'] <= 3, 0, 1)
        test_data_processed = self.scaler.transform(test_data_processed)
        return test_data_processed

    def predict_for_test_data(self):
        test_data_processed = self.preprocess_test_data()
        return self.pipeline.predict(test_data_processed)

# Initialize the classifier
pipeline = ANNClassifier()

# Load and preprocess the training data
pipeline.load_data('/content/historic.csv')
pipeline.preprocess_data()

# Create and train the model
pipeline.create_model()
pipeline.fit_model()

# Evaluate the model
pipeline.evaluate_model()

# Load and preprocess the test data
pipeline.load_test_data('/content/prediction_input.csv')
predicted_classes = pipeline.predict_for_test_data()


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       564
           1       0.65      1.00      0.79      1036

    accuracy                           0.65      1600
   macro avg       0.32      0.50      0.39      1600
weighted avg       0.42      0.65      0.51      1600

Accuracy: 0.65
Precision: 0.65
Recall: 1.00
F1 Score: 0.79


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the historic data
df = pd.read_csv('/content/historic.csv')

class Random_forest_pipeline:
    def __init__(self):
        self.rf_clf = None
        self.scaler = None
        self.label_encoder_category = None
        self.label_encoder_main_promotion = None
        self.label_encoder_color = None

    def load_data(self, file_path):
        """Load the data from the specified file path."""
        self.data = pd.read_csv(file_path)

    def preprocess_data(self):
        """Preprocess the data."""
        x = self.data.drop(['success_indicator', 'item_no'], axis=1)
        y = self.data['success_indicator']

        # Encode categorical variables
        self.label_encoder_category = LabelEncoder()
        x['category_encoded'] = self.label_encoder_category.fit_transform(x['category'])
        x.drop('category', axis=1, inplace=True)

        self.label_encoder_main_promotion = LabelEncoder()
        x['main_promotion_encoded'] = self.label_encoder_main_promotion.fit_transform(x['main_promotion'])
        x.drop('main_promotion', axis=1, inplace=True)

        self.label_encoder_color = LabelEncoder()
        x['color_encoded'] = self.label_encoder_color.fit_transform(x['color'])
        x.drop('color', axis=1, inplace=True)

        # Bin stars rating into two categories
        x['stars'] = np.where(x['stars'] <= 3, 0, 1)

        # Encoding target variable ('flop' as 0, 'top' as 1)
        label_encoder_target = LabelEncoder()
        y_encoded = label_encoder_target.fit_transform(y)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('flop'), 0, y_encoded)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('top'), 1, y_encoded)

        # Split the data into training and testing sets
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

        # Scale features
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(self.x_train)

    def train_model(self):
        """Train the Random Forest model."""
        self.rf_clf = RandomForestClassifier()
        self.rf_clf.fit(self.x_train, self.y_train)

    def test_model(self):
        """Test the trained model."""
        y_pred = self.rf_clf.predict(self.scaler.transform(self.x_test))

        # Evaluate the model performance
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)

        print("Accuracy with Random Forest Classifier:", accuracy)
        print("Precision with Random Forest Classifier:", precision)
        print("Recall with Random Forest Classifier:", recall)
        print("F1 Score with Random Forest Classifier:", f1)

        # Print classification report
        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))

    def load_test_data(self, file_path):
        """Load the unlabelled test data."""
        self.test_data = pd.read_csv(file_path)

    def preprocess_test_data(self):
        """Preprocess the test data."""
        test_data_processed = self.test_data.drop(['item_no'], axis=1)
        test_data_processed['category_encoded'] = self.label_encoder_category.transform(test_data_processed['category'])
        test_data_processed.drop('category', axis=1, inplace=True)
        test_data_processed['main_promotion_encoded'] = self.label_encoder_main_promotion.transform(test_data_processed['main_promotion'])
        test_data_processed.drop('main_promotion', axis=1, inplace=True)
        test_data_processed['color_encoded'] = self.label_encoder_color.transform(test_data_processed['color'])
        test_data_processed.drop('color', axis=1, inplace=True)
        test_data_processed['stars'] = np.where(test_data_processed['stars'] <= 3, 0, 1)
        test_data_processed = self.scaler.transform(test_data_processed)
        return test_data_processed

    def predict_for_test_data(self):
        """Predict the success indicator for the test data."""
        test_data_processed = self.preprocess_test_data()
        return self.rf_clf.predict(test_data_processed)

# Initialize the pipeline
pipeline = Random_forest_pipeline()

# Load and preprocess the historic data
pipeline.load_data('/content/historic.csv')
pipeline.preprocess_data()

# Train the model
pipeline.train_model()

# Test the model
pipeline.test_model()

# Load and preprocess the test data
pipeline.load_test_data('/content/prediction_input.csv')
predicted_classes = pipeline.predict_for_test_data()

Accuracy with Random Forest Classifier: 0.843125
Precision with Random Forest Classifier: 0.8504504504504504
Recall with Random Forest Classifier: 0.9173955296404276
F1 Score with Random Forest Classifier: 0.8826554464703131
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.71      0.76       571
           1       0.85      0.92      0.88      1029

    accuracy                           0.84      1600
   macro avg       0.84      0.81      0.82      1600
weighted avg       0.84      0.84      0.84      1600



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the historic data
df = pd.read_csv('/content/historic.csv')

class LogisticRegressionPipeline:
    def __init__(self):
        self.lr_clf = None
        self.scaler = None
        self.label_encoder_category = None
        self.label_encoder_main_promotion = None
        self.label_encoder_color = None

    def load_data(self, file_path):
        """Load the data from the specified file path."""
        self.data = pd.read_csv(file_path)

    def preprocess_data(self):
        """Preprocess the data."""
        x = self.data.drop(['success_indicator', 'item_no'], axis=1)
        y = self.data['success_indicator']

        # Encode categorical variables
        self.label_encoder_category = LabelEncoder()
        x['category_encoded'] = self.label_encoder_category.fit_transform(x['category'])
        x.drop('category', axis=1, inplace=True)

        self.label_encoder_main_promotion = LabelEncoder()
        x['main_promotion_encoded'] = self.label_encoder_main_promotion.fit_transform(x['main_promotion'])
        x.drop('main_promotion', axis=1, inplace=True)

        self.label_encoder_color = LabelEncoder()
        x['color_encoded'] = self.label_encoder_color.fit_transform(x['color'])
        x.drop('color', axis=1, inplace=True)

        # Bin stars rating into two categories
        x['stars'] = np.where(x['stars'] <= 3, 0, 1)

        # Encoding target variable ('flop' as 0, 'top' as 1)
        label_encoder_target = LabelEncoder()
        y_encoded = label_encoder_target.fit_transform(y)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('flop'), 0, y_encoded)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('top'), 1, y_encoded)

        # Split the data into training and testing sets
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

        # Scale features
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(self.x_train)

    def train_model(self):
        """Train the Logistic Regression model."""
        self.lr_clf = LogisticRegression()
        self.lr_clf.fit(self.x_train, self.y_train)

    def test_model(self):
        """Test the trained model."""
        y_pred = self.lr_clf.predict(self.scaler.transform(self.x_test))

        # Evaluate the model performance
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)

        print("Accuracy with Logistic Regression Classifier:", accuracy)
        print("Precision with Logistic Regression Classifier:", precision)
        print("Recall with Logistic Regression Classifier:", recall)
        print("F1 Score with Logistic Regression Classifier:", f1)

        # Print classification report
        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))

    def load_test_data(self, file_path):
        """Load the unlabelled test data."""
        self.test_data = pd.read_csv(file_path)

    def preprocess_test_data(self):
        """Preprocess the test data."""
        test_data_processed = self.test_data.drop(['item_no'], axis=1)
        test_data_processed['category_encoded'] = self.label_encoder_category.transform(test_data_processed['category'])
        test_data_processed.drop('category', axis=1, inplace=True)
        test_data_processed['main_promotion_encoded'] = self.label_encoder_main_promotion.transform(test_data_processed['main_promotion'])
        test_data_processed.drop('main_promotion', axis=1, inplace=True)
        test_data_processed['color_encoded'] = self.label_encoder_color.transform(test_data_processed['color'])
        test_data_processed.drop('color', axis=1, inplace=True)
        test_data_processed['stars'] = np.where(test_data_processed['stars'] <= 3, 0, 1)
        test_data_processed = self.scaler.transform(test_data_processed)
        return test_data_processed

    def predict_for_test_data(self):
        """Predict the success indicator for the test data."""
        test_data_processed = self.preprocess_test_data()
        return self.lr_clf.predict(test_data_processed)

pipeline = LogisticRegressionPipeline()
pipeline.load_data('/content/historic.csv')
pipeline.preprocess_data()
pipeline.train_model()
pipeline.test_model()
pipeline.load_test_data('/content/prediction_input.csv')
predicted_classes = pipeline.predict_for_test_data()


Accuracy with Logistic Regression Classifier: 0.784375
Precision with Logistic Regression Classifier: 0.8220338983050848
Recall with Logistic Regression Classifier: 0.8483965014577259
F1 Score with Logistic Regression Classifier: 0.8350071736011477
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       571
           1       0.82      0.85      0.84      1029

    accuracy                           0.78      1600
   macro avg       0.77      0.76      0.76      1600
weighted avg       0.78      0.78      0.78      1600



In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

class SVMPipeline:
    def __init__(self):
        self.svm_clf = None
        self.scaler = None
        self.label_encoder_category = None
        self.label_encoder_main_promotion = None
        self.label_encoder_color = None

    def load_data(self, file_path):
        """Load the data from the specified file path."""
        self.data = pd.read_csv(file_path)

    def preprocess_data(self):
        """Preprocess the data."""
        x = self.data.drop(['success_indicator', 'item_no'], axis=1)
        y = self.data['success_indicator']

        # Encode categorical variables
        self.label_encoder_category = LabelEncoder()
        x['category_encoded'] = self.label_encoder_category.fit_transform(x['category'])
        x.drop('category', axis=1, inplace=True)

        self.label_encoder_main_promotion = LabelEncoder()
        x['main_promotion_encoded'] = self.label_encoder_main_promotion.fit_transform(x['main_promotion'])
        x.drop('main_promotion', axis=1, inplace=True)

        self.label_encoder_color = LabelEncoder()
        x['color_encoded'] = self.label_encoder_color.fit_transform(x['color'])
        x.drop('color', axis=1, inplace=True)

        # Bin stars rating into two categories
        x['stars'] = np.where(x['stars'] <= 3, 0, 1)

        # Encoding target variable ('flop' as 0, 'top' as 1)
        label_encoder_target = LabelEncoder()
        y_encoded = label_encoder_target.fit_transform(y)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('flop'), 0, y_encoded)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('top'), 1, y_encoded)

        # Split the data into training and testing sets
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

        # Scale features
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(self.x_train)

    def train_model(self):
        """Train the SVM model."""
        self.svm_clf = SVC(kernel='linear')
        self.svm_clf.fit(self.x_train, self.y_train)

    def test_model(self):
        """Test the trained model."""
        y_pred = self.svm_clf.predict(self.scaler.transform(self.x_test))

        # Evaluate the model performance
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)

        print("Accuracy with SVM Classifier:", accuracy)
        print("Precision with SVM Classifier:", precision)
        print("Recall with SVM Classifier:", recall)
        print("F1 Score with SVM Classifier:", f1)

        # Print classification report
        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))

    def load_test_data(self, file_path):
        """Load the unlabelled test data."""
        self.test_data = pd.read_csv(file_path)

    def preprocess_test_data(self):
        """Preprocess the test data."""
        test_data_processed = self.test_data.drop(['item_no'], axis=1)
        test_data_processed['category_encoded'] = self.label_encoder_category.transform(test_data_processed['category'])
        test_data_processed.drop('category', axis=1, inplace=True)
        test_data_processed['main_promotion_encoded'] = self.label_encoder_main_promotion.transform(test_data_processed['main_promotion'])
        test_data_processed.drop('main_promotion', axis=1, inplace=True)
        test_data_processed['color_encoded'] = self.label_encoder_color.transform(test_data_processed['color'])
        test_data_processed.drop('color', axis=1, inplace=True)
        test_data_processed['stars'] = np.where(test_data_processed['stars'] <= 3, 0, 1)
        test_data_processed = self.scaler.transform(test_data_processed)
        return test_data_processed

    def predict_for_test_data(self):
        """Predict the success indicator for the test data."""
        test_data_processed = self.preprocess_test_data()
        return self.svm_clf.predict(test_data_processed)

# Initialize the SVM pipeline
pipeline = SVMPipeline()

# Load and preprocess the data
pipeline.load_data('/content/historic.csv')
pipeline.preprocess_data()

# Train the SVM model
pipeline.train_model()

# Test the model
pipeline.test_model()

# Load and preprocess the test data
pipeline.load_test_data('/content/prediction_input.csv')
predicted_classes = pipeline.predict_for_test_data()


Accuracy with SVM Classifier: 0.784375
Precision with SVM Classifier: 0.8220338983050848
Recall with SVM Classifier: 0.8483965014577259
F1 Score with SVM Classifier: 0.8350071736011477
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       571
           1       0.82      0.85      0.84      1029

    accuracy                           0.78      1600
   macro avg       0.77      0.76      0.76      1600
weighted avg       0.78      0.78      0.78      1600



In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

class KNNPipeline:
    def __init__(self):
        self.knn_clf = None
        self.scaler = None
        self.label_encoder_category = None
        self.label_encoder_main_promotion = None
        self.label_encoder_color = None

    def load_data(self, file_path):
        """Load the data from the specified file path."""
        self.data = pd.read_csv(file_path)

    def preprocess_data(self):
        """Preprocess the data."""
        x = self.data.drop(['success_indicator', 'item_no'], axis=1)
        y = self.data['success_indicator']

        # Encode categorical variables
        self.label_encoder_category = LabelEncoder()
        x['category_encoded'] = self.label_encoder_category.fit_transform(x['category'])
        x.drop('category', axis=1, inplace=True)

        self.label_encoder_main_promotion = LabelEncoder()
        x['main_promotion_encoded'] = self.label_encoder_main_promotion.fit_transform(x['main_promotion'])
        x.drop('main_promotion', axis=1, inplace=True)

        self.label_encoder_color = LabelEncoder()
        x['color_encoded'] = self.label_encoder_color.fit_transform(x['color'])
        x.drop('color', axis=1, inplace=True)

        # Bin stars rating into two categories
        x['stars'] = np.where(x['stars'] <= 3, 0, 1)

        # Encoding target variable ('flop' as 0, 'top' as 1)
        label_encoder_target = LabelEncoder()
        y_encoded = label_encoder_target.fit_transform(y)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('flop'), 0, y_encoded)
        y_encoded = np.where(y_encoded == label_encoder_target.classes_.tolist().index('top'), 1, y_encoded)

        # Split the data into training and testing sets
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

        # Scale features
        self.scaler = StandardScaler()
        self.x_train = self.scaler.fit_transform(self.x_train)

    def train_model(self):
        """Train the KNN model."""
        self.knn_clf = KNeighborsClassifier(n_neighbors=5)
        self.knn_clf.fit(self.x_train, self.y_train)

    def test_model(self):
        """Test the trained model."""
        y_pred = self.knn_clf.predict(self.scaler.transform(self.x_test))

        # Evaluate the model performance
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred)
        recall = recall_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)

        print("Accuracy with KNN Classifier:", accuracy)
        print("Precision with KNN Classifier:", precision)
        print("Recall with KNN Classifier:", recall)
        print("F1 Score with KNN Classifier:", f1)

        # Print classification report
        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))

    def load_test_data(self, file_path):
        """Load the unlabelled test data."""
        self.test_data = pd.read_csv(file_path)

    def preprocess_test_data(self):
        """Preprocess the test data."""
        test_data_processed = self.test_data.drop(['item_no'], axis=1)
        test_data_processed['category_encoded'] = self.label_encoder_category.transform(test_data_processed['category'])
        test_data_processed.drop('category', axis=1, inplace=True)
        test_data_processed['main_promotion_encoded'] = self.label_encoder_main_promotion.transform(test_data_processed['main_promotion'])
        test_data_processed.drop('main_promotion', axis=1, inplace=True)
        test_data_processed['color_encoded'] = self.label_encoder_color.transform(test_data_processed['color'])
        test_data_processed.drop('color', axis=1, inplace=True)
        test_data_processed['stars'] = np.where(test_data_processed['stars'] <= 3, 0, 1)
        test_data_processed = self.scaler.transform(test_data_processed)
        return test_data_processed

    def predict_for_test_data(self):
        """Predict the success indicator for the test data."""
        test_data_processed = self.preprocess_test_data()
        return self.knn_clf.predict(test_data_processed)

# Initialize the KNN pipeline
pipeline = KNNPipeline()

# Load and preprocess the data
pipeline.load_data('/content/historic.csv')
pipeline.preprocess_data()

# Train the KNN model
pipeline.train_model()

# Test the model
pipeline.test_model()

# Load and preprocess the test data
pipeline.load_test_data('/content/prediction_input.csv')
predicted_classes = pipeline.predict_for_test_data()


Accuracy with KNN Classifier: 0.84125
Precision with KNN Classifier: 0.8578024007386889
Recall with KNN Classifier: 0.902818270165209
F1 Score with KNN Classifier: 0.8797348484848485
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.73      0.77       571
           1       0.86      0.90      0.88      1029

    accuracy                           0.84      1600
   macro avg       0.83      0.82      0.82      1600
weighted avg       0.84      0.84      0.84      1600



In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten

# Load the data
df = pd.read_csv('/content/historic.csv')

# Encode categorical variables
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])
df['main_promotion_encoded'] = label_encoder.fit_transform(df['main_promotion'])
df['color_encoded'] = label_encoder.fit_transform(df['color'])

# Split features and target
X = df.drop(['success_indicator', 'item_no', 'category', 'main_promotion', 'color'], axis=1)
y = df['success_indicator']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert data to sequences for RNN input
X_train_seq = np.array(X_train_scaled).reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_seq = np.array(X_test_scaled).reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Encode target variable y_train
label_encoder_target = LabelEncoder()
y_train_encoded = label_encoder_target.fit_transform(y_train)

# Define the RNN model
model = Sequential([
    SimpleRNN(64, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2]), activation='relu'),
    Flatten(),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with encoded y_train
model.fit(X_train_seq, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Encode target variable y_test
y_test_encoded = label_encoder_target.transform(y_test)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_seq, y_test_encoded)
print(f"Test Accuracy: {accuracy}")

# Make predictions
y_pred = (model.predict(X_test_seq) > 0.5).astype("int32")

from sklearn.metrics import classification_report

# Generate classification report
report = classification_report(y_test_encoded, y_pred)
print("Classification Report:\n", report)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8168749809265137
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.64      0.71       571
           1       0.82      0.92      0.87      1029

    accuracy                           0.82      1600
   macro avg       0.81      0.78      0.79      1600
weighted avg       0.82      0.82      0.81      1600

