In [None]:
import pandas as pd
import re

# Read the scraped listings into a single-column frame. The file is treated as
# header-less, and the lone column is labelled 'Raw Data'.
# NOTE(review): read_csv splits fields on commas; records that contain ", "
# (as the address-style pattern used later expects) may not arrive intact in
# 'Raw Data' unless the file quotes them -- confirm against the actual file.
file_path = 'C:/Users/Bhargav/TravelAgentChatBot/chat-agent/scrapped.csv'
df = pd.read_csv(
    file_path,
    names=['Raw Data'],
    header=None,
)

# Feature extraction (with per-row debug output) and its helpers.

# "Restaurant <name> proche <location> - Restaurant - <location> - <cuisine>"
_PATTERN_GENERAL = re.compile(
    r'Restaurant (?P<name>.+) proche (?P<location>.+) - Restaurant - (?P=location) - (?P<cuisine>.+)'
)
# "<name> - <address>, <5-digit postal code>, <city> - <cuisine>"
_PATTERN_SPECIFIC = re.compile(
    r'(?P<name>.+) - (?P<address>.+), (?P<postal_code>\d{5}), (?P<city>[^-]+) - (?P<cuisine>.+)'
)


def _split_location(location):
    """Split a location such as 'Paris 13ème' into ``(city, district)``.

    The last whitespace-separated token is treated as the district only when
    it starts with a digit (e.g. '13ème', '1er'); otherwise the whole string
    is the city and the district is ``None``.
    """
    parts = location.rsplit(None, 1)
    if len(parts) == 2 and parts[1][:1].isdigit():
        return parts[0], parts[1]
    return location.strip(), None


def extract_features(row):
    """Parse one raw listing into structured restaurant fields.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose 'Raw Data'
            entry holds the raw listing text.

    Returns:
        dict with keys 'Name', 'Address', 'Postal Code', 'City', 'District'
        and 'Cuisine Type'. Every value is ``None`` when the text matches
        neither supported layout or is not a string (e.g. NaN).

    Progress and match details are printed for debugging.
    """
    raw = row['Raw Data']

    # Debug: print the current row being processed
    print("Processing row:", raw)

    # Only attempt the regexes on real text; re.match raises TypeError on
    # non-string values such as NaN.
    if isinstance(raw, str):
        # Match general pattern
        match_general = _PATTERN_GENERAL.match(raw)
        if match_general:
            print("General match found:", match_general.groupdict())  # Debug: print matched groups
            # City and district come from different parts of the location;
            # taking the last token for both would report '13ème' as the city.
            city, district = _split_location(match_general.group('location'))
            return {
                'Name': match_general.group('name'),
                'Address': None,
                'Postal Code': None,
                'City': city,
                'District': district,
                'Cuisine Type': match_general.group('cuisine'),
            }

        # Match specific pattern
        match_specific = _PATTERN_SPECIFIC.match(raw)
        if match_specific:
            print("Specific match found:", match_specific.groupdict())  # Debug: print matched groups
            postal_code = match_specific.group('postal_code')
            return {
                'Name': match_specific.group('name'),
                'Address': match_specific.group('address'),
                'Postal Code': postal_code,
                'City': match_specific.group('city').strip(),
                # District is taken as the last two digits of the postal code.
                'District': postal_code[-2:],
                'Cuisine Type': match_specific.group('cuisine'),
            }

    # If neither pattern matches, print a debug message
    print("No match found for this row.")
    return {
        'Name': None,
        'Address': None,
        'Postal Code': None,
        'City': None,
        'District': None,
        'Cuisine Type': None,
    }

# Run the extractor on every row; result_type='expand' turns each returned
# dict into one column per key.
features = df.apply(extract_features, axis=1, result_type='expand')

# Debug preview of the parsed columns
print("Extracted Features DataFrame:")
print(features.head())

# Attach the parsed columns next to the raw text, then discard the raw text
# so only the structured fields remain.
df = pd.concat([df, features], axis=1).drop(columns=['Raw Data'])

# Write the structured table to disk without the index column
output_csv_path = 'C:/Users/Bhargav/TravelAgentChatBot/extracted_features.csv'
df.to_csv(output_csv_path, index=False)

print(f"Processed data saved to {output_csv_path}")
# Show the first few rows of the final DataFrame
print(df.head())


Processing row: Restaurant Poulet proche Paris 13ème - Restaurant - Paris 13ème - Poulet
General match found: {'name': 'Poulet', 'location': 'Paris 13ème', 'cuisine': 'Poulet'}
Processing row: Restaurant Pizza proche Paris 13ème - Restaurant - Paris 13ème - Pizza
General match found: {'name': 'Pizza', 'location': 'Paris 13ème', 'cuisine': 'Pizza'}
Processing row: Restaurant Pad Thai proche Paris 13ème - Restaurant - Paris 13ème - Pad Thai
General match found: {'name': 'Pad Thai', 'location': 'Paris 13ème', 'cuisine': 'Pad Thai'}
Processing row: Restaurant Sushi proche Paris 13ème - Restaurant - Paris 13ème - Sushi
General match found: {'name': 'Sushi', 'location': 'Paris 13ème', 'cuisine': 'Sushi'}
Processing row: Restaurant Couscous proche Paris 13ème - Restaurant - Paris 13ème - Couscous
General match found: {'name': 'Couscous', 'location': 'Paris 13ème', 'cuisine': 'Couscous'}
Processing row: Restaurant Pâtes proche Paris 13ème - Restaurant - Paris 13ème - Pâtes
General match found: