In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load user data from CSV
user_data = pd.read_csv('C:\\Users\\ROG\\OneDrive\\Desktop\\Root\\ML\\user_data.csv')

# Load places data from CSV
places_data = pd.read_csv('C:\\Users\\ROG\\OneDrive\\Desktop\\Root\\ML\\places_data.csv')

# NLP Preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    words = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    return ' '.join(filtered_words)

# Apply preprocessing to the activities and reviews in both datasets
user_data['Preferred Activities'] = user_data['Preferred Activities'].apply(preprocess_text)
places_data['latest_reviews'] = places_data['latest_reviews'].apply(preprocess_text)

# Combine the 'latest_reviews' column for better representation of the place description
places_data['Place Description'] = places_data['latest_reviews']

# Tokenization and padding for deep learning
tokenizer = Tokenizer()
tokenizer.fit_on_texts(user_data['Preferred Activities'].tolist() + places_data['Place Description'].tolist())

# Convert the texts to sequences
user_sequences = tokenizer.texts_to_sequences(user_data['Preferred Activities'])
place_sequences = tokenizer.texts_to_sequences(places_data['Place Description'])

# Pad sequences to ensure consistent input size
max_sequence_length = max(max(len(seq) for seq in user_sequences), max(len(seq) for seq in place_sequences))
user_padded_sequences = pad_sequences(user_sequences, maxlen=max_sequence_length)
place_padded_sequences = pad_sequences(place_sequences, maxlen=max_sequence_length)

# Build the CNN Model
embedding_dim = 100  # Size of the word embeddings
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(len(places_data), activation='softmax')  # Output for places classification
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare the target data (convert place names to label encoded values)
label_encoder = LabelEncoder()
place_labels = label_encoder.fit_transform(places_data['name'])

# Simulate user labels for training (in practice, you will use real user feedback)
user_labels = [place_labels[i % len(place_labels)] for i in range(len(user_data))]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(user_padded_sequences, user_labels, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predict function for recommendations
def recommend_places_cnn(user_index, model, user_data, places_data):
    user_sequence = user_padded_sequences[user_index].reshape(1, -1)
    predictions = model.predict(user_sequence)
    
    # Get top 5 recommendations
    top_5_indices = predictions[0].argsort()[-5:][::-1]
    recommended_places = places_data.iloc[top_5_indices]
    
    return recommended_places[['name', 'rating', 'latest_reviews']]

# Example: Recommend places for the first user
recommended_places = recommend_places_cnn(0, model, user_data, places_data)
print(f"Top 5 recommendations for {user_data.iloc[0]['Name']}:\n", recommended_places)

# Evaluate the model (precision, recall, F1-score) could be implemented similar to the previous version


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

user_data = pd.read_csv('Visitors Preference Dataset.xlsx - user_data_version_3_10K_Users.csv')