In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load user data from CSV
user_data = pd.read_csv('C:\\Users\\ROG\\OneDrive\\Desktop\\Root\\ML\\user_data.csv')
places_data = pd.read_csv('C:\\Users\\ROG\\OneDrive\\Desktop\\Root\\ML\\places_data.csv')

# NLP Preprocessing function
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text
    words = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    return ' '.join(filtered_words)

# Apply preprocessing to the activities and reviews in both datasets
user_data['Preferred Activities'] = user_data['Preferred Activities'].apply(preprocess_text)
places_data['latest_reviews'] = places_data['latest_reviews'].apply(preprocess_text)

# Combine the 'latest_reviews' column for better representation of the place description
places_data['Place Description'] = places_data['latest_reviews']

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()

# Combine all user activities and place descriptions into a single text corpus for vectorization
combined_text = pd.concat([user_data['Preferred Activities'], places_data['Place Description']])
tfidf_matrix = vectorizer.fit_transform(combined_text)

# Split the matrix back into user and place matrices
user_tfidf = tfidf_matrix[:len(user_data)-1]
place_tfidf = tfidf_matrix[len(user_data):]

# Encode place names as target labels
label_encoder = LabelEncoder()
place_labels = label_encoder.fit_transform(places_data['name'])

# Train-test split (simulate user preferences with place labels)
X_train, X_test, y_train, y_test = train_test_split(user_tfidf.toarray(), place_labels, test_size=0.2, random_state=42)

# Build the Neural Network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),  # Prevent overfitting
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(places_data), activation='softmax')  # Softmax for multi-class classification (places)
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Function to recommend places using the Neural Network
def recommend_places_nn(user_index, model, user_tfidf, place_data, label_encoder):
    # Get the user's TF-IDF vector and reshape for prediction
    user_vector = user_tfidf[user_index].toarray().reshape(1, -1)
    
    # Predict the probabilities of each place
    predictions = model.predict(user_vector)
    
    # Get the top 5 places with the highest probabilities
    top_5_indices = predictions[0].argsort()[-5:][::-1]
    
    # Decode the predicted place indices back to place names
    recommended_places = label_encoder.inverse_transform(top_5_indices)
    
    # Return the recommended places
    return place_data[place_data['name'].isin(recommended_places)][['name', 'rating', 'latest_reviews']]

# Example: Recommend places for the first user
recommended_places = recommend_places_nn(0, model, user_tfidf, places_data, label_encoder)
print(f"Top 5 recommendations for {user_data.iloc[0]['Name']}:\n", recommended_places)

# Example: Evaluate model performance
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Model Evaluation - Loss: {loss}, Accuracy: {accuracy}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ROG\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Found input variables with inconsistent numbers of samples: [10000, 411]