<a href="https://colab.research.google.com/github/Dharmateja180/school-website-/blob/main/image_correctness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import cv2
import requests
from io import BytesIO
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
import joblib  # Import joblib directly
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load your dataset (replace '/Amazon.csv' with the actual path to your dataset)
data = pd.read_csv('/content/Amazon.csv')

# Extract and preprocess image links
image_links = data['img_link']

# Initialize lists to store image features and corresponding URLs
importedImages = []
image_urls = []

# Download, preprocess, and store image features
for img_url in image_links:
    response = requests.get(img_url)
    image = np.asarray(bytearray(response.content), dtype="uint8")
    image_bgr = cv2.imdecode(image, cv2.IMREAD_COLOR)
    image_bgr = cv2.resize(image_bgr, (224, 224))
    numpy_image = img_to_array(image_bgr)
    image_batch = preprocess_input(np.expand_dims(numpy_image, axis=0))
    importedImages.append(image_batch)
    image_urls.append(img_url)

# Convert images to an array
images = np.vstack(importedImages)

# Load a pre-trained VGG16 model for feature extraction
vgg_model = VGG16(weights='imagenet')

from tensorflow.keras.models import Model

# Create a feature extractor model from VGG16
feat_extractor = Model(inputs=vgg_model.input, outputs=vgg_model.get_layer("fc2").output)

# Extract image features
processed_imgs = preprocess_input(images.copy())
imgs_features = feat_extractor.predict(processed_imgs)






In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarities between images
cos_similarities = cosine_similarity(imgs_features)

# Store the results in a Pandas DataFrame
cos_similarities_df = pd.DataFrame(cos_similarities, columns=image_urls, index=image_urls)

# Display the cosine similarity DataFrame
print(cos_similarities_df)

from sklearn.cluster import KMeans
# Define the number of clusters (you may adjust this based on your needs)
num_clusters = 5

# Convert cosine similarities DataFrame to a NumPy array
cos_sim_matrix = cos_similarities_df.to_numpy()

# Apply K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(cos_sim_matrix)

# Add cluster labels to the original DataFrame
cos_similarities_df['cluster'] = cluster_labels

# Display the clusters
clustered_images = cos_similarities_df[['cluster']]

# Print the number of images in each cluster
cluster_counts = clustered_images['cluster'].value_counts()
print("Number of images in each cluster:")
print(cluster_counts)

# Calculate a threshold for identifying outliers (you can adjust this as needed)
threshold = 0.2  # Adjust the threshold based on your requirements

# Identify outliers based on the threshold
outliers = cos_similarities_df[cos_similarities_df < threshold].dropna(how='all')

# Display the outliers
print("Outliers (listings with cosine similarity below the threshold):")
print(outliers)

# Optionally, you can store or process the outlier listings as needed.

def assess_image_correctness(row):
    # Implement your image correctness assessment logic here
    # For example, you could check if the image URL is valid or if the image exists.
    # You can customize this function based on your criteria.

    image_url = row['img_link']  # Assuming 'img_link' is the column with image URLs

    # Perform image assessment here
    # You can use libraries like requests to check the image URL
    # and determine if the image is valid or not
    try:
        response = requests.head(image_url)
        if response.status_code == 200:
            return True
        else:
            return False
    except requests.exceptions.RequestException as e:
        return False

# Apply the image assessment function to your dataset
data['image_correct'] = data.apply(assess_image_correctness, axis=1)

# You can filter your DataFrame to get only the listings with correct images:
correct_listings = data[data['image_correct']]
# To get the listings with incorrect images:
incorrect_listings = data[~data['image_correct']]

# Assume you have a DataFrame 'data' with user feedback and 'image_correct' column
# The 'image_correct' column should indicate whether each listing's image is correct (True) or not (False).

# Define a function to analyze user feedback and image correctness
def analyze_user_feedback(row):
    # Implement your user feedback analysis logic here
    if row['image_correct']:
        # If the image is already marked as correct, return 'Correct'
        return 'Correct'
    else:
        # Here you can implement additional logic to consider user feedback
        # For example, if user feedback suggests an issue with the image, mark it as 'Incorrect'
        # You can also use additional user feedback data if available in your DataFrame
        # In this example, we're using a hypothetical 'user_feedback' column
        user_feedback = row['user_feedback']  # Assuming 'user_feedback' is the column with user feedback
        if "incorrect image" in user_feedback.lower():
            return 'Incorrect'
        else:
            return 'Needs Review'  # If neither correct nor incorrect feedback is provided

# Apply the user feedback analysis function to your dataset
data['image_feedback'] = data.apply(analyze_user_feedback, axis=1)

# To get listings marked as 'Correct' based on the analysis:
correct_listings = data[data['image_feedback'] == 'Correct']

# To get listings marked as 'Incorrect' based on the analysis:
incorrect_listings = data[data['image_feedback'] == 'Incorrect']

# To get listings marked as 'Needs Review' based on the analysis:
needs_review_listings = data[data['image_feedback'] == 'Needs Review']

# Define a function to assess overall image quality and correctness
def assess_image_quality(row):
    if row['image_correct'] and row['image_feedback'] == 'Correct':
        return 'High Quality'
    elif not row['image_correct'] and row['image_feedback'] == 'Incorrect':
        return 'Low Quality (Misleading)'
    elif not row['image_correct'] and row['image_feedback'] == 'Needs Review':
        return 'Low Quality (Needs Review)'
    else:
        return 'Unknown Quality'

# Apply the image quality assessment function to your dataset
data['image_quality'] = data.apply(assess_image_quality, axis=1)

# To get listings with High Quality Images:
high_quality_listings = data[data['image_quality'] == 'High Quality']

# To get listings with Low Quality (Misleading) Images:
low_quality_misleading = data[data['image_quality'] == 'Low Quality (Misleading)']
# To get listings with Low Quality (Needs Review) Images:
low_quality_needs_review = data[data['image_quality'] == 'Low Quality (Needs Review)']
# To get listings with Unknown Quality:
unknown_quality_listings = data[data['image_quality'] == 'Unknown Quality']

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X = imgs_features
y = data['image_correct'].values

print(X.shape)
print(y.shape)

def your_indexing_logic(common_id, data):
    # Implement the logic to find the index in X that corresponds to the common_id
    # This example assumes you have a DataFrame 'data' with a 'product_id' column
    try:
        index = data[data['product_id'] == common_id].index[0]
        return index
    except IndexError:
        # Handle the case where the common_id is not found
        return None

def match_data_based_on_common_index(X, y, common_index, data):
    X_matched = []
    y_matched = []

    for common_id in common_index:
        # Implement your custom indexing logic to find the index in X
        # This may vary depending on your specific dataset structure
        x_index = your_indexing_logic(common_id, data)  # Replace with your actual indexing logic

        if x_index is not None and x_index < len(y):
            X_matched.append(X[x_index])
            y_matched.append(y[x_index])

    return np.array(X_matched), np.array(y_matched)

# Define your common index (replace 'product_id' with the actual column name)
common_index = data['product_id']

# Match X and y based on the common index
X, y = match_data_based_on_common_index(X, y, common_index, data)

# Identify non-numeric values in the 'rating' column
non_numeric_indices = ~data['rating'].str.replace('.', '', 1).str.isnumeric()
# Optionally, you can print or inspect the rows with non-numeric values
print(data[non_numeric_indices])
# Remove rows with non-numeric values from the DataFrame
data = data[~non_numeric_indices]
# Convert the 'rating' column to a numerical (float) data type
data['rating'] = data['rating'].astype(float)
# Define your threshold as a float
threshold_rating = 4.0
# Create a binary target variable
data['target'] = (data['rating'] > threshold_rating).astype(int)

# Remove commas from 'rating_count' and convert it to float
data['rating_count'] = data['rating_count'].str.replace(',', '').astype(float)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,precision_score, recall_score, f1_score, roc_auc_score

# Define your threshold as a float
threshold_rating = 4.0
# Create a binary target variable
data['target'] = (data['rating'] > threshold_rating).astype(int)

# Split your data into training and testing sets
X = data[['rating', 'rating_count']]  # Exclude 'discounted_price'
y = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Count the number of rows with missing values in X_train
rows_with_missing_values = X_train.isna().sum(axis=1)

# Calculate the total number of rows with missing values
num_rows_with_missing_values = len(rows_with_missing_values[rows_with_missing_values > 0])

print("Number of rows with missing values in X_train:", num_rows_with_missing_values)

# Remove rows with missing values
X_train = X_train.dropna()
y_train = y_train[X_train.index]  # Update the labels accordingly

# Now, you can proceed to train your logistic regression model

model = LogisticRegression()
model.fit(X_train, y_train)

joblib.dump(model, 'image_correctness_model.pkl')

# Make predictions on the test set
y_pred = model.predict(X_test)
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Predict using the trained model
y_pred = model.predict(X_test)

# Confusion Matrix
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

# Precision Score
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.2f}")

# Recall Score
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

# ROC AUC Score
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
print(f"ROC AUC Score: {roc_auc:.2f}")



                                                    https://m.media-amazon.com/images/I/31+NwZ8gb1L._SX300_SY300_.jpg  \
https://m.media-amazon.com/images/I/31+NwZ8gb1L...                                           1.000000                   
https://m.media-amazon.com/images/W/WEBP_402378...                                           0.996949                   
https://m.media-amazon.com/images/I/31Wb+A3VVdL...                                           0.518979                   
https://m.media-amazon.com/images/I/418YrbHVLCL...                                           0.391592                   
https://m.media-amazon.com/images/I/31iFF1KbkpL...                                           0.308028                   
...                                                                                               ...                   
https://m.media-amazon.com/images/I/41sJ4KQa5xL...                                           0.295199                   
https://m.media-amazon.com/image



Number of images in each cluster:
3    407
2    403
1    252
0    233
4    170
Name: cluster, dtype: int64
Outliers (listings with cosine similarity below the threshold):
                                                    https://m.media-amazon.com/images/I/31+NwZ8gb1L._SX300_SY300_.jpg  \
https://m.media-amazon.com/images/I/31+NwZ8gb1L...                                                NaN                   
https://m.media-amazon.com/images/W/WEBP_402378...                                                NaN                   
https://m.media-amazon.com/images/I/31Wb+A3VVdL...                                                NaN                   
https://m.media-amazon.com/images/I/418YrbHVLCL...                                                NaN                   
https://m.media-amazon.com/images/I/31iFF1KbkpL...                                                NaN                   
...                                                                                               ...  

  non_numeric_indices = ~data['rating'].str.replace('.', '', 1).str.isnumeric()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating'] = data['rating'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = (data['rating'] > threshold_rating).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_c