In [1]:
import ast
import collections
import itertools
import json
import re

import pandas as pd
import scipy.sparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.svm import SVC


In [2]:

# Read the Arizona review table and the business table from local CSV exports.
# NOTE(review): both paths are absolute and machine-specific.
review_data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\26591\\Downloads\\yelp_data\\yelp_review_arizona.csv"
)

business_data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\26591\\Downloads\\yelp_data\\yelp_business.csv"
)


In [3]:
# Use the review's star rating, unchanged, as the 'sentiment' label
# (a multi-class target such as 4.0, 5.0, 2.0 — not a 0/1 flag).
# NOTE(review): this cell was previously described as building a binary label
# (stars > 3 -> 1, otherwise 0); the code assigns the stars as-is — confirm
# which target is intended.
review_data['sentiment'] = review_data['stars']

# Keep only the review text and its label for modelling
data = review_data[['text', 'sentiment']]


In [4]:
# Display the text/sentiment frame (the notebook shows a truncated view)
data

Unnamed: 0,text,sentiment
0,Came here while in town for a country concert....,4.0
1,Best barbecue this side of the Mississippi!!!!...,5.0
2,Absolutely amazing. Think Chipotle for enchila...,5.0
3,I was really disappointed with my most recent ...,2.0
4,I grade sushi restaurants on 3 factors:\n- Qua...,4.0
...,...,...
26601,Met a friend here for lunch. It was not as pa...,3.0
26602,Love this place! One of the best local Mexican...,5.0
26603,I like the unique atmosphere of this place. bu...,4.0
26604,"Oh Ticoz, I have been through a lot with you. ...",2.0


In [5]:
# TF-IDF vectorizer: English stop words dropped, vocabulary capped at 10,000 terms
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')

# Target labels, one per review
labels = data['sentiment']

# Learn the vocabulary and turn every review text into a sparse TF-IDF row
features = tfidf.fit_transform(data['text'])

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Support Vector Classifier with a linear kernel
svm_model = SVC(kernel='linear')

# Fit on the training split; the fitted estimator is the cell's displayed value
svm_model.fit(X=X_train, y=y_train)


In [None]:
# Label the held-out rows with the fitted SVM
predictions = svm_model.predict(X_test)

# Report overall accuracy, then the per-class precision/recall/F1 table
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", test_accuracy)
test_report = classification_report(y_test, predictions)
print("Classification Report:\n", test_report)


In [None]:
def preprocess_text(text, tfidf_vectorizer):
    """Vectorize a single text with an already-fitted vectorizer.

    The text is wrapped in a one-element list and handed to
    ``tfidf_vectorizer.transform``; whatever that call returns is returned
    unchanged. No cleaning (lowercasing, stripping characters) happens here.
    """
    single_document = [text]
    return tfidf_vectorizer.transform(single_document)

def predict_sentiment(new_text, model, tfidf_vectorizer):
    """Return the model's predicted label for one piece of text.

    ``new_text`` is vectorized through ``preprocess_text`` so it lands in the
    same feature space the model was trained on; the first (and only) entry of
    ``model.predict`` is returned as the sentiment.
    """
    vectorized = preprocess_text(new_text, tfidf_vectorizer)
    predicted_labels = model.predict(vectorized)
    return predicted_labels[0]

# Try the sentiment predictor on a hand-written review
new_review_text = "I really like their cheese cake, it's great!"
sentiment = predict_sentiment(
    new_text=new_review_text,
    model=svm_model,
    tfidf_vectorizer=tfidf,
)
print("The predicted sentiment of the review is:", sentiment)


In [None]:
# Read the review and business tables again into fresh frames
reviews = pd.read_csv(
    filepath_or_buffer="C:\\Users\\26591\\Downloads\\yelp_data\\yelp_review_arizona.csv"
)
businesses = pd.read_csv(
    filepath_or_buffer="C:\\Users\\26591\\Downloads\\yelp_data\\yelp_business.csv"
)

# Attach each review to its business row via the shared business_id key
data = reviews.merge(businesses, on='business_id')

def extract_attribute(text, attribute):
    """Check if the attribute is set to 'True' in the attributes string.

    Parameters
    ----------
    text : str or missing value
        Stringified attributes mapping, e.g. ``"{'GoodForKids': 'True'}"``.
    attribute : str
        Exact attribute key to look for.

    Returns
    -------
    bool
        True only when ``'<attribute>': 'True'`` occurs in ``text``;
        False for missing input.
    """
    if pd.isna(text):
        return False
    # Literal substring match on the fully quoted key: the opening quote stops
    # keys that merely end with the name (e.g. 'NotGoodForKids') from matching,
    # and avoiding a regex means characters in the name are never interpreted
    # as pattern syntax.
    needle = f"'{attribute}': 'True'"
    return needle in text

# Derive one boolean column per attribute of interest
# (extend the tuple to cover further attributes)
for flag_name in ('GoodForKids', 'RestaurantsReservations'):
    data[flag_name] = data['attributes'].apply(extract_attribute, attribute=flag_name)

# Re-fit a TF-IDF vectorizer on the merged frame's review text
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
text_features = tfidf.fit_transform(data['text'])


In [None]:
# Two boolean targets per review; add other attributes as needed
labels = data[['GoodForKids', 'RestaurantsReservations']]

X_train, X_test, y_train, y_test = train_test_split(text_features, labels, test_size=0.2, random_state=42)

# One random forest per target column. The forest is seeded (like the split)
# so the metrics below are reproducible across runs.
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

# Predict on the test set
predictions = model.predict(X_test)

# Print performance metrics
print("Accuracy Score:\n", accuracy_score(y_test, predictions))
# Evaluate the model using classification_report
print("Classification Report:\n", classification_report(y_test, predictions, zero_division=0))


In [None]:
def preprocess_text(text, tfidf_vectorizer):
    """Turn one text into TF-IDF features.

    Parameters
    ----------
    text
        The document to vectorize; passed through as-is (no cleaning is done).
    tfidf_vectorizer
        Object exposing ``transform``; it receives ``[text]``.

    Returns
    -------
    The value returned by ``tfidf_vectorizer.transform``.
    """
    vectorize = tfidf_vectorizer.transform
    return vectorize([text])

def predict_attributes(new_text, model, tfidf_vectorizer):
    """Predict the two business attributes for one review text.

    The text is vectorized with ``preprocess_text`` and passed to
    ``model.predict``; the first prediction row is paired, in order, with
    'GoodForKids' and 'RestaurantsReservations' and returned as a dict.
    """
    attribute_names = ('GoodForKids', 'RestaurantsReservations')
    vectorized = preprocess_text(new_text, tfidf_vectorizer)
    first_row = model.predict(vectorized)[0]
    return dict(zip(attribute_names, first_row))

# Try the attribute predictor on a hand-written review
new_review_text = "The place is great for families, very welcoming and spacious."
predicted_attributes = predict_attributes(
    new_text=new_review_text,
    model=model,
    tfidf_vectorizer=tfidf,
)
print("Predicted Attributes:", predicted_attributes)


In [None]:
# Attribute keys to turn into boolean columns; add more as needed
attributes = [
    'GoodForKids',
    'RestaurantsReservations',
    'BusinessAcceptsCreditCards',
    'OutdoorSeating',
    'WiFi',
]

def extract_attribute(text, attribute):
    """Return True when ``attribute`` is set to the string 'True' in ``text``.

    ``text`` is a stringified attributes mapping such as
    ``"{'OutdoorSeating': 'True'}"``; missing values yield False.
    """
    if pd.isna(text):
        return False
    # Match the fully quoted key as a literal substring: the opening quote
    # rejects keys that only end with the name, and no regex is involved, so
    # the attribute name is never treated as pattern syntax.
    needle = f"'{attribute}': 'True'"
    return needle in text

# Add one boolean column per listed attribute, derived from the raw attributes string
for attribute in attributes:
    data[attribute] = data['attributes'].apply(extract_attribute, attribute=attribute)


In [None]:
# Show the list of attribute names used for the label columns
attributes

In [None]:

# Prepare labels for all attributes
labels = data[attributes]

# Re-split against the new labels. Without this the model would be fit on the
# X_train / y_train left over from the earlier two-attribute split, and the
# labels built above would never be used.
X_train, X_test, y_train, y_test = train_test_split(text_features, labels, test_size=0.2, random_state=42)

# Train the model (seeded so repeated runs give the same forest)
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train, y_train)


In [None]:
def predict_attributes(new_text, model, tfidf_vectorizer):
    """Predict a "Yes"/"No" answer for each name in the global ``attributes``.

    The text is vectorized with ``tfidf_vectorizer.transform`` and the first
    row of ``model.predict`` is paired, in order, with ``attributes``; truthy
    predictions map to "Yes", everything else to "No".
    """
    vectorized = tfidf_vectorizer.transform([new_text])
    first_row = model.predict(vectorized)[0]

    answers = {}
    for name, flag in zip(attributes, first_row):
        answers[name] = "Yes" if flag else "No"
    return answers

# Run the Yes/No attribute predictor on a hand-written review
new_review_text = "This place has great service, accepts credit cards, and is good for kids. They have WiFi and outdoor seating."
predicted_attributes = predict_attributes(
    new_text=new_review_text,
    model=model,
    tfidf_vectorizer=tfidf,
)
print("Predicted Attributes:", predicted_attributes)


In [None]:
def predict_attributes(new_text, model, tfidf_vectorizer):
    """Map one review text to {attribute name: "Yes" | "No"}.

    Names come from the global ``attributes`` list and are matched positionally
    against the first row returned by ``model.predict`` for the vectorized text.
    """
    answer_for = ("No", "Yes")  # indexed by the truthiness of each prediction
    first_row = model.predict(tfidf_vectorizer.transform([new_text]))[0]
    return {attr: answer_for[bool(pred)] for attr, pred in zip(attributes, first_row)}

# Same hand-written review, run through the predictor defined in this cell
new_review_text = "This place has great service, accepts credit cards, and is good for kids. They have WiFi and outdoor seating."
predicted_attributes = predict_attributes(
    new_text=new_review_text,
    model=model,
    tfidf_vectorizer=tfidf,
)
print("Predicted Attributes:", predicted_attributes)


In [None]:

# Assuming 'attributes', 'hours', and 'categories' are stored in a complex format
# Here's a simplified example of how you might extract and encode these:

# Simplified extraction of attributes (assuming they are stored as strings of dictionaries)

def extract_attribute(attr_string, key):
    """Look up ``key`` in a stringified attributes dictionary.

    Parameters
    ----------
    attr_string : str or missing value
        Text of a Python dict literal, e.g. ``"{'WiFi': 'free'}"``.
    key : str
        Attribute name to look up.

    Returns
    -------
    The value stored under ``key``, or None when the input is not a string,
    is not a valid literal, is not a dict, or lacks the key.
    """
    if not isinstance(attr_string, str):
        return None
    try:
        # literal_eval only accepts Python literals, so text read from the
        # data file can never execute code the way eval() would.
        attr_dict = ast.literal_eval(attr_string)
    except (ValueError, SyntaxError, TypeError):
        return None
    if not isinstance(attr_dict, dict):
        return None
    return attr_dict.get(key, None)

attributes_of_interest = ['GoodForKids', 'RestaurantsReservations', 'BusinessParking', 'WiFi']
for attr in attributes_of_interest:
    business_data[attr] = business_data['attributes'].apply(extract_attribute, key=attr)

# One-hot encode the extracted attribute columns. This uses its own encoder and
# result name so the category encoding below does not refit and overwrite it.
# NOTE(review): encoded_attributes is not part of `features` below — confirm
# whether it should be stacked in as well.
attribute_encoder = OneHotEncoder()
encoded_attributes = attribute_encoder.fit_transform(business_data[attributes_of_interest])

# Handling 'hours' and 'categories' similarly would require additional preprocessing
# For simplicity, treat 'categories' as a comma-separated list and keep only the
# first entry. Non-string values (e.g. NaN for businesses without categories)
# become None instead of raising on indexing.
business_data['categories'] = business_data['categories'].apply(
    lambda x: x.split(', ')[0] if isinstance(x, str) else None
)
encoder = OneHotEncoder()
encoded_categories = encoder.fit_transform(business_data[['categories']])

# Combine the encoded categories with the numeric columns into a single feature matrix
features = scipy.sparse.hstack((encoded_categories, business_data[['is_open', 'review_count']]))

# Ratings as target
ratings = business_data['stars']


In [None]:
# Show the 'stars_x' column of the merged frame.
# NOTE(review): presumably the review-level star rating — pandas appends '_x'
# to overlapping left-hand columns in a merge; confirm.
data['stars_x']

In [None]:
mlb = MultiLabelBinarizer()
# Keep only the first category on the business table (non-strings become None)
business_data['categories'] = business_data['categories'].apply(lambda x: x.split(', ')[0] if isinstance(x, str) else None)

# MultiLabelBinarizer expects one iterable of labels per row. Passing the raw
# comma-separated strings would make every single character a "category", so
# split each string into a list first; rows without categories get no labels.
category_lists = data['categories'].apply(
    lambda x: [part.strip() for part in x.split(',') if part.strip()] if isinstance(x, str) else []
)
categories_encoded = mlb.fit_transform(category_lists)

# Build the indicator frame on data's own index so the join aligns row-for-row
categories_df = pd.DataFrame(
    categories_encoded,
    columns=['Category_' + col for col in mlb.classes_],
    index=data.index,
)

# Join back to the main DataFrame; dropping any previous Category_ columns
# first keeps the cell re-runnable.
data = data.drop(columns=categories_df.columns, errors='ignore').join(categories_df)


In [None]:

# Split the business feature matrix and star ratings (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, ratings, test_size=0.2, random_state=42)

# Initialize and train the model. The forest is seeded, like the split, so the
# reported metrics are reproducible across runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict ratings
predicted_ratings = model.predict(X_test)


In [None]:

# RMSE is the square root of the MSE. Taking the root here avoids the
# `squared=False` keyword, which newer scikit-learn releases no longer accept.
rmse = mean_squared_error(y_test, predicted_ratings) ** 0.5
print("RMSE:", rmse)
print("R²:", r2_score(y_test, predicted_ratings))


In [None]:
# List the business table's column names
business_data.columns

In [None]:
# Inspect one raw 'attributes' value (the entry indexed by 2) to see its format
business_data['attributes'][2]

In [None]:

# Rebind `data` to the business table. This is an alias, not a copy: columns
# added to `data` below are added to `business_data` as well.
data = business_data
# Function to safely parse JSON-like strings into dictionaries
def parse_attributes(attr_str):
    """Parse an attributes string as JSON after swapping single for double quotes.

    Parameters
    ----------
    attr_str : str or missing value
        JSON-like text, e.g. ``"{'WiFi': 'free'}"``.

    Returns
    -------
    The decoded JSON value, or ``{}`` when the input is not a string or the
    text is still not valid JSON after the quote swap (for example when it
    contains Python-only literals such as True/None).
    """
    try:
        # Replace single quotes with double quotes for valid JSON
        return json.loads(attr_str.replace("'", '"'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input such as NaN;
        # ValueError: covers json.JSONDecodeError for malformed text.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return {}

# Parse the 'attributes' column (the column is named 'attributes', plural)
data['parsed_attributes'] = data['attributes'].apply(parse_attributes)

# Assuming 'data' has been loaded and column names verified
def parse_attributes(attr_str):
    """Turn a stringified attributes literal into a Python object.

    Missing values (as judged by ``pd.isna``) give ``{}``. Otherwise the text
    is evaluated with ``ast.literal_eval``; a ``SyntaxError`` or ``ValueError``
    during parsing also gives ``{}``.
    """
    try:
        if pd.isna(attr_str):
            return {}
        return ast.literal_eval(attr_str)
    except (ValueError, SyntaxError):
        return {}

# Parse the 'attributes' column
data['parsed_attributes'] = data['attributes'].apply(parse_attributes)

# Flatten all dictionaries and keep only keys whose value is the string 'True'.
# chain and Counter are reached through the itertools / collections modules,
# which is how this notebook imports them.
all_attributes = list(itertools.chain.from_iterable(
    [key for key, value in attrs.items() if value == 'True']
    for attrs in data['parsed_attributes']
    if isinstance(attrs, dict)  # skip anything that did not parse to a dict
))

# Count the frequencies of each attribute
attribute_counts = collections.Counter(all_attributes)

# Get the top 10 most frequent attributes
top_10_attributes = attribute_counts.most_common(10)

# Print the result
print("Top 10 most frequent attributes:")
for attribute, count in top_10_attributes:
    print(f"{attribute}: {count}")

In [None]:


def remove_char(txt):
    """Strip URLs and non-alphanumeric characters, lowercase, and collapse whitespace.

    Every character that is not a letter, digit, space or tab is removed, as is
    any ``scheme://...`` run; the remainder is lowercased and its words are
    re-joined with single spaces.
    """
    # Raw string so that \w, \/ and \S reach the regex engine as written rather
    # than being flagged as invalid string escape sequences.
    pattern = r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return " ".join(re.sub(pattern, "", txt).lower().split())
    

# Split every string in a list into words and flatten the result
def split_words(list_name):
    """Return one flat list of the whitespace-separated words in ``list_name``.

    Parameters
    ----------
    list_name : iterable of str, or a single str
        Strings to split. A bare string is treated as a one-element list
        rather than being iterated character by character.

    Returns
    -------
    list of str
    """
    if isinstance(list_name, str):
        list_name = [list_name]
    # Iterate the argument itself, not the global name `all`
    return list(itertools.chain.from_iterable(entry.split() for entry in list_name))
def remove_true_false(sentence):
    """Delete the standalone words 'true'/'false' (any casing) from ``sentence``.

    After removal, every run of whitespace is collapsed to one space and the
    result is stripped of leading and trailing whitespace.
    """
    without_flags = re.sub(r'\b(true|false)\b', '', sentence, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', without_flags).strip()
list_of_words = []  # flat list of words; not named 'list' so the built-in is not shadowed
business_data['attributes'] = business_data['attributes'].fillna('')  # Fill NaN values with empty strings

for i in business_data['attributes']:
    all_words = remove_char(i)
    all_words = remove_true_false(all_words)
    # split_words takes a list of strings and returns a flat list of words.
    # Extending with that list keeps list_of_words flat (and hashable per item),
    # so it can be counted with collections.Counter.
    list_of_words.extend(split_words([all_words]))

# Preview only the first entries rather than dumping the whole list
list_of_words[:20]



In [None]:
# Tally how often each entry of list_of_words occurs
counts = collections.Counter()
counts.update(list_of_words)

# Show the 20 most frequent entries
counts.most_common(20)

In [None]:
def remove_true_false(sentence):
    """Return ``sentence`` without the whole words 'true' and 'false'.

    Matching ignores case and respects word boundaries. Whitespace left behind
    is squeezed to single spaces and trimmed from both ends.
    """
    flag_word = re.compile(r'\b(true|false)\b', flags=re.IGNORECASE)
    stripped_of_flags = flag_word.sub('', sentence)
    squeezed = re.sub(r'\s+', ' ', stripped_of_flags)
    return squeezed.strip()
# Clean one sample attributes string (the entry indexed by 100) to check the helpers.
# The result is stored under its own name so the built-in all() is not
# shadowed for the rest of the session.
sample_attribute_text = remove_char(data['attributes'][100])
sample_attribute_text = remove_true_false(sample_attribute_text)
sample_attribute_text