<a href="https://colab.research.google.com/github/ChrisLouis9913/ICTExit/blob/main/Reteston7thNov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("/content/mobile_review.csv")

# Columns to clean: include 'price_local' if present and also any object columns that contain digits
candidate_obj_cols = [c for c in df.select_dtypes(include="object").columns
                      if df[c].astype(str).str.contains(r"\d", na=False).any()]
cols_to_clean = [c for c in (["price_local"] + candidate_obj_cols) if c in df.columns]

def clean_numeric_series(s: pd.Series) -> pd.Series:
    # convert to string, handle negatives in parentheses, strip whitespace
    s2 = s.astype(str).str.strip()
    s2 = s2.replace({"nan": ""})  # pandas may have 'nan' strings
    s2 = s2.str.replace(r'^\((.*)\)$', r'-\1', regex=True)  # (123) -> -123
    # remove all characters except digits, dot and minus
    s2 = s2.str.replace(r'[^0-9\.\-]', '', regex=True)
    # empty strings -> NaN, then convert to numeric (coerce invalid)
    s2 = s2.replace('', np.nan)
    return pd.to_numeric(s2, errors="coerce")

# Clean each column and handle NaNs (fill with median or 0 if median is NaN)
for col in cols_to_clean:
    df[col] = clean_numeric_series(df[col])
    med = df[col].median(skipna=True)
    if np.isnan(med):
        df[col].fillna(0, inplace=True)
    else:
        df[col].fillna(med, inplace=True)

# Quick check
print("Cleaned columns:", cols_to_clean)
print(df[cols_to_clean].dtypes)
print(df[cols_to_clean].isna().sum())

# df is ready for further use

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(med, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(med, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

Cleaned columns: ['price_local', 'model', 'price_local', 'review_date']
price_local    float64
model          float64
price_local    float64
review_date    float64
dtype: object
price_local    0
model          0
price_local    0
review_date    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# determine feature columns (use existing cols_to_clean if available)
if 'cols_to_clean' in globals():
    feature_cols = [c for c in cols_to_clean if c in df.columns]
else:
    feature_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != "Rating"]

if "price_local" not in df.columns:
    raise KeyError("Column 'Rating' not found in dataframe")

X = df[feature_cols].copy()
y = df["price_local"].copy()

# ensure no NaNs in features
for c in X.columns:
    if X[c].isna().any():
        med = X[c].median(skipna=True)
        X[c].fillna(med if not np.isnan(med) else 0, inplace=True)

# split, scale, train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# print coefficients per feature
for name, coef in zip(X.columns, model.coef_):
    print(f"{name}: {coef:.6f}")
print("Intercept:", model.intercept_)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
import re

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Create sentiment mapping dictionary
sentiment_map = {
    1: "Positive",
    0: "Neutral",
    -1: "Negative"
}

def clean_text(text):
    """Clean and preprocess text data"""
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        text = ' '.join([word for word in text.split() if word not in stop_words])
        return text
    return ''

# Create Sentiment_Label column
df['Review_Length'] = df['review_length'].map(sentiment_map)

# Drop rows with missing review text
df = df.dropna(subset=['Review_body'])

# Clean the text
df['cleaned_text'] = df['Review_body'].apply(clean_text)

# Split features and target
X = df['cleaned_text']
y = df['Review_Length']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and fit TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Example prediction
def predict_sentiment(text):
    cleaned = clean_text(text)
    tfidf_vector = tfidf.transform([cleaned])
    prediction = model.predict(tfidf_vector)
    return prediction[0]

# Test the model with a sample review
sample_text = "This product is amazing and works perfectly!"
print(f"\nSample text: {sample_text}")
print(f"Predicted sentiment: {predict_sentiment(sample_text)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: ['Review_body']

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Initialize and fit the tokenizer
MAX_WORDS = 10000  # Maximum number of words to keep
MAX_LEN = 100     # Maximum length of each sequence

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

# Convert labels to categorical
label_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y_train_cat = to_categorical([label_dict[label] for label in y_train])
y_test_cat = to_categorical([label_dict[label] for label in y_test])

# Build the model
model = Sequential([
    Embedding(MAX_WORDS, 100, input_length=MAX_LEN),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(16, activation='relu'),
    Dense(3, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_pad,
    y_train_cat,
    batch_size=32,
    epochs=5,
    validation_split=0.2
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_pad, y_test_cat)
print(f"\nTest accuracy: {test_accuracy:.4f}")

NameError: name 'X_train' is not defined

In [21]:
from sklearn.metrics import classification_report

# Make predictions with the trained model
predictions = model.predict(X_test_pad)
predicted_classes = predictions.argmax(axis=1)

# Convert numeric predictions back to labels
reverse_label_dict = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
predicted_labels = [reverse_label_dict[i] for i in predicted_classes]
actual_labels = [label for label in y_test]

# Print classification report for deep learning model
print("\nDeep Learning Model Classification Report:")
print(classification_report(actual_labels, predicted_labels))

# Function to predict sentiment using the deep learning model
def predict_sentiment_dl(text):
    # Preprocess the text
    cleaned = clean_text(text)
    # Convert to sequence
    sequence = tokenizer.texts_to_sequences([cleaned])
    # Pad sequence
    padded = pad_sequences(sequence, maxlen=MAX_LEN)
    # Make prediction
    pred = model.predict(padded)
    # Get predicted class
    pred_class = pred.argmax(axis=1)[0]
    return reverse_label_dict[pred_class]

# Test the deep learning model with the same sample
print("\nDeep Learning Model Prediction:")
print(f"Sample text: {sample_text}")
print(f"Predicted sentiment: {predict_sentiment_dl(sample_text)}")

NameError: name 'model' is not defined

#Last question: I don't know ðŸ˜ž