<a href="https://colab.research.google.com/github/09aryan/MEME_CLASSIFICATION_USING_SENTIMENT_ANALYSIS/blob/main/InceptionV3model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import csv
import time
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
from google.colab import drive

def process_image(img_path, model, target_size=(299, 299)):
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    img_array = preprocess_input(img_array)
    img_array = tf.image.resize(img_array, (299, 299))
    img_array = tf.expand_dims(img_array, 0)  # Create a batch

    features = model.predict(img_array)

    if tf.rank(features) == 4:
        features = tf.reduce_mean(features, axis=(1, 2))
    else:
        # Handle the case where the features have only 2 dimensions
        features = tf.reduce_mean(features, axis=1)

    return features.numpy().tolist()

def process_images_in_colab(model, folder_path, target_size=(299, 299), batch_size=32):
    image_data = []
    img_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)
                 if filename.endswith((".jpg", ".jpeg", ".png"))]

    start_time = time.time()

    for i, img_path in enumerate(img_paths):
        try:
            features = process_image(img_path, model, target_size=target_size)

            image_data.append({
                'ImageName': os.path.basename(img_path),
                'Features': features
            })

            if (i + 1) % batch_size == 0:
                elapsed_time = time.time() - start_time
                print(f"Processed {i + 1} images in {elapsed_time:.2f} seconds")

        except Exception as e:
            print(f"Error processing image {img_path}: {str(e)}")

    return image_data

def save_to_csv(image_data, csv_file_name):
    with open(csv_file_name, mode='w', newline='') as csvfile:
        fieldnames = ['ImageName', 'Features']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for data in image_data:
            writer.writerow(data)

drive.mount('/content/drive')

base_model = InceptionV3(weights='imagenet', include_top=False)

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = Dense(512, activation='relu')(x)
output = Dense(2048, activation='linear')(x)

model = Model(inputs=base_model.input, outputs=output)

folder_path = '/content/drive/MyDrive/images'
image_data = process_images_in_colab(model, folder_path, target_size=(299, 299), batch_size=32)
csv_file_name = '/content/drive/MyDrive/image_data_fine_tuned.csv'
save_to_csv(image_data, csv_file_name)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 2144 images in 869.09 seconds
Error processing image /content/drive/MyDrive/images/image_5119.png: image file is truncated
Processed 2176 images in 880.97 seconds
Processed 2208 images in 893.24 seconds
Processed 2240 images in 906.68 seconds
Processed 2272 images in 920.16 seconds
Processed 2304 images in 933.14 seconds
Processed 2336 images in 946.99 seconds
Processed 2368 images in 958.68 seconds
Processed 2400 images in 971.42 seconds
Processed 2432 images in 984.40 seconds
Processed 2464 images in 997.69 seconds
Processed 2496 images in 1010.98 seconds
Processed 2528 images in 1024.21 seconds
Processed 2560 images in 1036.67 seconds
Processed 2592 images in 1047.99 seconds
Processed 2624 images in 1060.76 seconds
Processed 2656 images in 1073.89 seconds
Processed 2688 images in 1086.51 seconds
Processed 2720 images in 1099.34 seconds
Processed 2752 images in 1110.79 seconds
Processed 2784 images in 1121.78 

In [None]:
import pandas as pd

# Load the CSV files into pandas DataFrames
labels_df = pd.read_csv('/content/drive/MyDrive/labels.csv')
image_data_df = pd.read_csv('/content/drive/MyDrive/image_data_fine_tuned.csv')

# Merge the DataFrames based on the 'image_name' and 'ImageName' columns
combined_df = pd.merge(labels_df, image_data_df, left_on='image_name', right_on='ImageName', how='inner')

# Select only the columns of interest
result_df = combined_df[['image_name', 'text_ocr', 'text_corrected', 'overall_sentiment', 'Features']]

# Save the result to a new CSV file
result_df.to_csv('/content/drive/MyDrive/combined_data.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Load the combined dataset
df = pd.read_csv('/content/drive/MyDrive/combined_data.csv')

# Preprocess the sentiment labels for multi-class classification
le = LabelEncoder()
df['overall_sentiment'] = le.fit_transform(df['overall_sentiment'])

# Split the dataset into features (X) and labels (y)
X = np.vstack(df['Features'].apply(eval).values)
y = df['overall_sentiment'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a sentiment analysis model for multi-class classification
num_classes = len(le.classes_)
model = Sequential([
    Dense(512, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.00001), loss='categorical_crossentropy', metrics=['accuracy'])

# Convert labels to one-hot encoding for multi-class classification
y_train_one_hot = to_categorical(y_train, num_classes)
y_test_one_hot = to_categorical(y_test, num_classes)

# Train the model
model.fit(X_train_scaled, y_train_one_hot, epochs=50, batch_size=100, validation_split=0.2)

# Evaluate the model
accuracy = model.evaluate(X_test_scaled, y_test_one_hot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 47.13%


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping

# Load the combined dataset
df = pd.read_csv('/content/drive/MyDrive/combined_data.csv')

# Preprocess the sentiment labels for multi-class classification
le = LabelEncoder()
df['overall_sentiment'] = le.fit_transform(df['overall_sentiment'])

# Split the dataset into features (X) and labels (y)
X = np.vstack(df['Features'].apply(eval).values)
y = df['overall_sentiment'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Learning rate scheduler
def lr_schedule(epoch):
    return 0.00001 * 0.9**epoch

lr_scheduler = LearningRateScheduler(lr_schedule)

# Early stopping
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

# Build a sentiment analysis model for multi-class classification
num_classes = len(le.classes_)
model = Sequential([
    Dense(512, activation='relu', input_dim=X_train_scaled.shape[1]),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Compile the model with RMSprop optimizer and categorical crossentropy loss
model.compile(optimizer=RMSprop(learning_rate=0.00001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model with learning rate scheduler and early stopping
model.fit(X_train_scaled, y_train, epochs=100, batch_size=100, validation_split=0.2, callbacks=[lr_scheduler, early_stopping])

# Evaluate the model
accuracy = model.evaluate(X_test_scaled, y_test)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Test Accuracy: 33.43%


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords

# Load your dataset
# Assuming df is your DataFrame with columns: 'Features' and 'overall_sentiment'
df = pd.read_csv('/content/drive/MyDrive/combined_data.csv')

# Convert 'overall_sentiment' to one-hot encoding
df_one_hot = pd.get_dummies(df['overall_sentiment'], prefix='sentiment')

# Concatenate the one-hot encoding columns with the original DataFrame
df = pd.concat([df, df_one_hot], axis=1)

# Extract features and labels
X = df['Features']
y = df['overall_sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text cleaning and preprocessing
stop_words = set(stopwords.words('english'))
X_train = X_train.apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
X_test = X_test.apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Convert text data to numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Use SMOTE to address class imbalance on the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

# Initialize a Support Vector Machine (SVM) classifier and perform grid search
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}
svm_classifier = SVC()
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5)
grid_search.fit(X_train_smote, y_train_smote)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Train the SVM classifier with the best parameters
svm_classifier = SVC(**best_params)
svm_classifier.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_classification_rep = classification_report(y_test, svm_predictions)

print('SVM Model:')
print(f'Accuracy: {svm_accuracy}')
print('Classification Report:')
print(svm_classification_rep)

# Try a Random Forest classifier as well
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_smote, y_train_smote)
rf_predictions = rf_classifier.predict(X_test_tfidf)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_classification_rep = classification_report(y_test, rf_predictions)

print('Random Forest Model:')
print(f'Accuracy: {rf_accuracy}')
print('Classification Report:')
print(rf_classification_rep)


Best Parameters: {'C': 10, 'kernel': 'rbf'}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Model:
Accuracy: 0.07890961262553801
Classification Report:
               precision    recall  f1-score   support

     negative       0.07      0.98      0.14       103
      neutral       0.00      0.00      0.00       405
     positive       0.41      0.01      0.03       657
very_negative       0.00      0.00      0.00        23
very_positive       0.00      0.00      0.00       206

     accuracy                           0.08      1394
    macro avg       0.10      0.20      0.03      1394
 weighted avg       0.20      0.08      0.02      1394

Random Forest Model:
Accuracy: 0.02295552367288379
Classification Report:
               precision    recall  f1-score   support

     negative       0.00      0.00      0.00       103
      neutral       0.00      0.00      0.00       405
     positive       0.41      0.01      0.03       657
very_negative       0.02      1.00      0.03        23
very_positive       0.00      0.00      0.00       206

     accuracy                   

In [None]:
import nltk
nltk.download('stopwords', download_dir='/usr/share/nltk_data')


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True