# Projet d'Analyse du Tourisme au Maroc

# Étape 1 – Scraping Reddit

### voir le fichier : citimes_tourism_morocco_scraper.py

# Étape 2 – Nettoyage des données

In [248]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import re

# Load data
df = pd.read_csv("./Data/scraped_data.csv")
print(f"Original size: {len(df)}")

df = df.drop_duplicates(subset=['text'])

Original size: 128076


In [249]:
# Liste étendue des phrases/mots spam à supprimer
spam_phrases = [
    # Sites commerciaux suspects (basé sur vos résultats)
    "dhgate.com", "dhgate com", "sale dhgate", "http sale",
    "trendsupplier.com", "trendsupplier com", "clearance inventory",
    "amazon https", "regexr tech", "com clearance",
    "kofi regexr", "patreon regexr", "tech support","remix","wav","itunes",
    "imgur com", "bandcamp com","Uganda",
    
    # Termes commerciaux
    "brand new", "new sealed", "sealed niche", "tester box",
    "tester tester", "tester cap", "box niche", "cap niche",
    "box cap", "niche creed", "maison francis kurkdjian",
    "buy developer", "coffee https", "monthly https",
    "support project", "developer coffee",
    
    # URLs et liens
    "https www", "https site", "http://", "https://",
    "www.", ".com", ".net", ".org", "bit.ly", "tinyurl",
    "goo.gl", "t.co", "preview redd", "auto webp", "width format",
    
    # Phrases commerciales/promotionnelles
    "click here", "buy now", "limited time offer", "special discount",
    "free shipping", "act now", "call now", "order today",
    "for sale", "best price", "discount", "clearance","moderators subreddit message","subreddit message compose","automatically contact moderators",
    "bot action performed","action performed automatically","contact moderators subreddit",
    
    # Contenu générique/répétitif
    "lorem ipsum", "placeholder text", "sample text",
    "test content", "dummy text", "explore rated",
    
    # Contenu non-touristique
    "cryptocurrency", "bitcoin", "forex trading", "make money online",
    "work from home", "diet pills", "weight loss", "itunes vudu","naxos","naxos naxos naxos",
    
    # Répétitions/caractères suspects
    "aaaaa", "!!!!!", "????", "-----"
]

In [250]:
tourism_keywords = [
    #  Destinations

    #  Voyage / Tourisme général
    'travel', 'traveler', 'travelling', 'tourism', 'tourist', 'vacation', 'holiday',
    'trip', 'journey', 'itinerary', 'visit', 'tour', 'guide', 'planning',
    'backpacking', 'destination', 'passport', 'visa', 'adventure',

    #  Attractions & Activités
    'medina', 'kasbah', 'kasbahs', 'mosque', 'palace', 'madrasa', 'museum',
    'souk', 'bazaar', 'market', 'hammam', 'desert', 'sahara', 'camel', 'trek',
    'camping', 'quad', 'sandboarding', 'sunset', 'sunrise', 'oasis', 'atlas',
    'mountain', 'mountains', 'valley', 'beach', 'coast', 'sea', 'ocean', 'waterfall',

    #  Hébergement & Transport
    'hotel', 'riad', 'hostel', 'accommodation', 'guesthouse', 'airbnb',
    'resort', 'check-in', 'room', 'lodge', 'tent', 'camp',
    'airport', 'flight', 'transport', 'bus', 'taxi', 'train', 'car rental',
    'drive', 'roadtrip',

    #  Gastronomie & Services
    'restaurant', 'food', 'cuisine', 'tajine', 'couscous', 'mint tea', 'street food',
    'dinner', 'breakfast', 'lunch', 'local dish', 'meal', 'taste', 'drink', 'snack',

    #  Culture & Expérience
    'culture', 'cultural', 'heritage', 'tradition', 'traditional', 'local',
    'authentic', 'berber', 'arabic', 'craft', 'artisan', 'music', 'festival',
    'performance', 'experience', 'vibe', 'hospitality', 'welcoming', 'people'
]

In [251]:
# Fonction améliorée pour vérifier si le contenu contient du spam
def contains_spam(text):
    if pd.isna(text):
        return True  # Considérer les valeurs nulles comme spam
    
    text_lower = text.lower().strip()
    
    # Supprimer les textes trop courts (probablement pas informatifs)
    if len(text_lower) < 30:
        return True
    
    # Vérifier chaque phrase spam
    for spam_phrase in spam_phrases:
        if spam_phrase.lower() in text_lower:
            return True
    
    # Vérifier si le contenu contient au moins un mot-clé touristique
    has_tourism_keyword = any(keyword.lower() in text_lower for keyword in tourism_keywords)
    
    # Si aucun mot-clé touristique n'est trouvé, considérer comme spam
    if not has_tourism_keyword:
        return True
    
    # Vérifier le ratio de caractères alphabétiques (éviter les textes avec trop de chiffres/symboles)
    alpha_ratio = sum(c.isalpha() or c.isspace() for c in text_lower) / len(text_lower)
    if alpha_ratio < 0.7:  # Moins de 70% de caractères alphabétiques
        return True
    
    return False

In [252]:
# Fonction pour vérifier si le contenu est vraiment touristique
def is_tourism_content(text):
    if pd.isna(text):
        return False
    
    text_lower = text.lower()
    
    # Compter les mots-clés touristiques
    tourism_score = sum(1 for keyword in tourism_keywords if keyword.lower() in text_lower)
    
    # Doit avoir au moins 2 mots-clés touristiques pour être considéré comme contenu touristique
    return tourism_score >= 2

In [253]:
# Appliquer le filtre anti-spam amélioré
print("Filtering spam content with enhanced detection...")

# Combine title and text FIRST (before spam filtering)
df["content"] = (df["title"].fillna('') + ' ' + df["text"].fillna(''))
# Appliquer le filtre anti-spam sur le contenu combiné
spam_mask = df['content'].apply(contains_spam)
spam_count = spam_mask.sum()
print(f"Found {spam_count} spam/non-tourism entries to remove")

# Supprimer le contenu spam
df_clean = df[~spam_mask].copy()
print(f"After spam filtering: {len(df_clean)}")

Filtering spam content with enhanced detection...
Found 83466 spam/non-tourism entries to remove
After spam filtering: 38497


In [254]:
# Appliquer un filtre supplémentaire pour s'assurer que c'est vraiment du contenu touristique
tourism_mask = df_clean['content'].apply(is_tourism_content)
tourism_count = tourism_mask.sum()
non_tourism_count = len(df_clean) - tourism_count

print(f"Tourism content found: {tourism_count}")
print(f"Non-tourism content removed: {non_tourism_count}")

# Garder seulement le contenu touristique
df = df_clean[tourism_mask].copy()
print(f"Final tourism dataset size: {len(df)}")

Tourism content found: 18659
Non-tourism content removed: 19838
Final tourism dataset size: 18659


In [255]:
# Final cleaning - remove duplicates and very short content
df = df.drop_duplicates(subset=['content'])
df = df[df["content"].str.len() > 50]  # Augmenter le seuil minimum
print(f"After removing duplicates and short posts: {len(df)}")

After removing duplicates and short posts: 18562


In [256]:
city_mapping = {
    'marrakesh': 'Marrakech',
    'marrakech': 'Marrakech',
    'fes': 'Fès',
    'fès': 'Fès',
    'merzouga': 'Merzouga',
    'imlil': 'Imlil',
    'tizi n’oucheg': 'Tizi n’Oucheg',  # uniformiser les apostrophes aussi
}

# Appliquer le mapping après avoir tout mis en minuscules
df['city'] = df['city'].str.lower().map(city_mapping).fillna(df['city'])

# remettre les majuscules au début
df['city'] = df['city'].str.title()

In [257]:
# Liste des villes connues (grandes villes touristiques)
villes = [
    'Marrakech', 'Fès', 'Casablanca', 'Rabat', 'Agadir', 'Tanger',
    'Meknès', 'Essaouira', 'Ouarzazate', 'El Jadida', 'Kenitra',
    'Ifrane', 'Al Hoceima', 'Nador', 'Saidia', 'Tetouan', 'Taroudant',
    'Zagora', 'Errachidia',"Safi","Dakhla","Chefchaouen"
]

# Création d'une nouvelle colonne pour type de lieu
df['lieu_type'] = df['city'].apply(lambda x: 'Ville' if x in villes else 'Village')

# Étape 3 – Enrichissement des données

## Sentiment analysis avec TextBlob 

In [260]:
from textblob import TextBlob

# Fonction de détection du sentiment
def get_sentiment(text):
    blob = TextBlob(str(text))
    polarity = blob.sentiment.polarity
    if polarity > 0.1:
        return "Positif"
    elif polarity < -0.1:
        return "Négatif"
    else:
        return "Neutre"

In [261]:
df['sentiment'] = df['content'].apply(get_sentiment)

## Classification thématique :

In [263]:
# Tourism terms analysis
tourism_terms = {
      'Attractions Naturelles': [
        'desert', 'sahara', 'dunes', 'oasis', 'valley',
        'atlas', 'anti-atlas', 'mountains', 'high atlas', 'middle atlas',
        'beach', 'coast', 'sea', 'ocean', 'waterfall', 'agafay', 'palm grove',
        'nature', 'canyon', 'gorge'
    ],

    'Sites Culturels': [
        'medina', 'kasbah', 'kasbahs', 'mosque', 'koutoubia', 'palace', 'bahia',
        'el badi', 'madrasa', 'museum', 'souk', 'bazaar', 'market', 'hammam',
        'fortress', 'ramparts', 'ruins', 'old town', 'unesco site', 'architecture'
    ],

    'Activités': [
        'tour', 'trip', 'visit', 'guide', 'guided tour', 'excursion',
        'trekking', 'hiking', 'quad', 'camel ride', 'camel trekking', 'camping',
        'shopping', 'exploring', 'photography','hammam',
        'surfing', 'spa', 'wellness', 'cooking class', 'henna', 'yoga','gnawa'
    ],

    'Hébergement': [
        'hotel', 'riad', 'hostel', 'guesthouse', 'accommodation', 'stay',
        'room', 'suite', 'lodge', 'camp', 'tent', 'resort', 'airbnb',
        'booking', 'check-in', 'check out', 'reception'
    ],

    'Nourriture & Boissons': [
        'food', 'restaurant', 'cuisine', 'gastronomy', 'tajine', 'tagine',
        'couscous', 'mint tea', 'mint', 'tea', 'coffee', 'street food',
        'bread', 'mechoui', 'pastilla', 'sweets', 'pastry', 'breakfast',
        'dinner', 'meal', 'snack', 'drink', 'orange juice'
    ],

    'Transport': [
        'airport', 'flight', 'train', 'bus', 'taxi', 'car', 'car rental',
        'driving', 'tgv', 'ctm', 'supratours', 'petit taxi', 'grand taxi',
        'motorbike', 'scooter', 'road trip', 'transport', 'public transport'
    ],
    
    'Sécurité': [
        'safety', 'police', 'emergency', 'scam', 'pickpocket', 'theft',
        'travel advisory', 'security check', 'border control', 'visa',
        'passport', 'travel insurance', 'health', 'vaccination', 'covid',
        'first aid', 'local laws', 'customs', 'authorities', 'crime',
        'safe areas', 'unsafe areas', 'night safety', 'solo travel',
        'female travel', 'child safety', 'crowds', 'protest', 'demonstration'
    ]
}

def classify_themes_multi(text):
    matched_categories = set()
    text = str(text).lower()
    
    for category, keywords in tourism_terms.items():
        for keyword in keywords:
            if keyword.lower() in text:
                matched_categories.add(category)
    
    if matched_categories:
        return list(matched_categories)
    else:
        return ["Autre"]


df['themes'] = df['content'].apply(classify_themes_multi)

## Géolocalisation :

In [265]:
def load_coords():
    coords_df = pd.read_csv("./Data/coordonnees_villes.csv")
    return coords_df

coords = load_coords()
df = df.merge(coords, on="city", how="left")

In [266]:
cols_to_keep = ['city', 'content', 'lieu_type','themes', 'sentiment','latitude','longitude']
df_cleaned = df[cols_to_keep]

In [267]:
# Liste des villes à supprimer (avec toutes les variantes possibles)
villes_a_supprimer = [
    "M’Hamid El Ghizlane", 
    "M'Hamid El Ghizlane", 
    "Aït Ben Haddou"
]

# Filtrage en une seule ligne
df_cleaned = df[~df['city'].isin(villes_a_supprimer)]
print(df_cleaned['city'].unique())

['Marrakech' 'Fès' 'Casablanca' 'Rabat' 'Agadir' 'Chefchaouen' 'Essaouira'
 'Ouarzazate' 'Tanger' 'Meknès' 'Taghazout' 'Dakhla' 'Safi' 'Merzouga'
 'Imlil' 'Tetouan' 'Imsouane' 'Akchour' 'Ourika' 'Bhalil' 'Tizi N’Oucheg'
 'Tameslouht' 'Asni' 'Tamegroute' 'Ksar El Khorbat' 'Mirleft'
 'Sidi Kaouki' 'Moulay Idriss Zerhoun' 'Ouirgane' 'Tafraoute' 'Tamnougalt'
 'Skoura' 'Aguergour' 'Douar Samra' 'Aremd' 'Bou Tharar' 'Taddart' 'Tidli'
 'Imilchil' 'El Jadida' 'Kenitra' 'Ifrane' 'Al Hoceima' 'Nador' 'Saidia'
 'Volubilis' 'Taroudant' 'Zagora' 'Errachidia']


In [268]:
df_cleaned.to_csv('./Data/maroc_villes.csv', index=False, encoding='utf-8')
print("✅ Exporté avec succès !")

✅ Exporté avec succès !


## Analyse statistique du dataset

In [270]:
# ========== SPAM DETECTION STATISTICS ==========
print(f"\n📊 Enhanced Spam Filtering Statistics:")
print(f"   Original entries: {len(df) + spam_count + non_tourism_count:,}")
print(f"   Spam entries removed: {spam_count:,}")
print(f"   Non-tourism entries removed: {non_tourism_count:,}")
print(f"   Final tourism entries retained: {len(df):,}")
print(f"   Total removal percentage: {((spam_count + non_tourism_count) / (len(df) + spam_count + non_tourism_count) * 100):.2f}%")

if spam_count > 0:
    print(f"\n🚫 Most common spam phrases detected:")
    # Recompute spam detection for statistics
    original_df = pd.read_csv("./Data/scraped_data.csv")
    original_content = (original_df["title"].fillna('') + ' ' + original_df["text"].fillna(''))
    
    spam_phrase_counts = {}
    for phrase in spam_phrases:
        # Use simple string matching instead of regex to avoid special character issues
        count = original_content.str.lower().str.contains(phrase.lower(), regex=False, na=False).sum()
        if count > 0:
            spam_phrase_counts[phrase] = count
    
    # Show top spam phrases found
    sorted_spam = sorted(spam_phrase_counts.items(), key=lambda x: x[1], reverse=True)
    for phrase, count in sorted_spam[:15]:  # Show top 15
        print(f"   '{phrase}': {count:,} times")

print(f"\n✅ Enhanced tourism analysis with n-grams and spam filtering complete!")


📊 Enhanced Spam Filtering Statistics:
   Original entries: 121,866
   Spam entries removed: 83,466
   Non-tourism entries removed: 19,838
   Final tourism entries retained: 18,562
   Total removal percentage: 84.77%

🚫 Most common spam phrases detected:
   'https://': 10,681 times
   '.com': 7,813 times
   'www.': 6,017 times
   't.co': 4,005 times
   'wav': 1,147 times
   'http://': 900 times
   '.org': 804 times
   '.net': 491 times
   'discount': 444 times
   'clearance': 412 times
   '-----': 352 times
   'click here': 205 times
   'aaaaa': 204 times
   'best price': 203 times
   'brand new': 182 times

✅ Enhanced tourism analysis with n-grams and spam filtering complete!


In [271]:
# ========== UNIGRAMS ANALYSIS ==========
print("\n" + "="*60)
print("📊 UNIGRAMS ANALYSIS")
print("="*60)

# Simple Count Vectorizer with library stop words
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=3,
    lowercase=True,
    token_pattern=r'[a-zA-Z]{3,}'
)

count_matrix = vectorizer.fit_transform(df["content"])
feature_names = vectorizer.get_feature_names_out()
word_counts = count_matrix.sum(axis=0).A1
word_freq = sorted(zip(feature_names, word_counts), key=lambda x: x[1], reverse=True)

print("\n🏆 Most frequent words (Top 30):")
for i, (term, count) in enumerate(word_freq[:30], 1):
    print(f"{i:2d}. {term:15} {count:6,} times")


📊 UNIGRAMS ANALYSIS

🏆 Most frequent words (Top 30):
 1. like            13,905 times
 2. just            12,914 times
 3. people          10,484 times
 4. time             9,756 times
 5. don              8,017 times
 6. day              6,531 times
 7. morocco          6,147 times
 8. know             6,025 times
 9. think            5,787 times
10. really           5,756 times
11. good             5,297 times
12. want             5,209 times
13. way              4,905 times
14. going            4,403 times
15. trip             4,319 times
16. make             4,246 times
17. did              4,128 times
18. city             3,962 times
19. things           3,912 times
20. didn             3,755 times
21. lot              3,727 times
22. said             3,685 times
23. nthe             3,648 times
24. days             3,601 times
25. experience       3,570 times
26. need             3,515 times
27. got              3,450 times
28. great            3,397 times
29. visit            3

In [272]:
# ========== BIGRAMS ANALYSIS ==========
print("\n" + "="*60)
print("📊 BIGRAMS ANALYSIS")
print("="*60)

# Vectorizer pour les bigrammes
bigram_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(2, 2),
    min_df=3,
    lowercase=True,
    token_pattern=r'[a-zA-Z]{3,}'
)

bigram_matrix = bigram_vectorizer.fit_transform(df["content"])
bigram_features = bigram_vectorizer.get_feature_names_out()
bigram_counts = bigram_matrix.sum(axis=0).A1
bigram_freq = sorted(zip(bigram_features, bigram_counts), key=lambda x: x[1], reverse=True)

print(f"\n🏆 Most frequent bigrams (Top 30):")
for i, (bigram, count) in enumerate(bigram_freq[:30], 1):
    print(f"{i:2d}. {bigram:25} {count:6,} times")


📊 BIGRAMS ANALYSIS

🏆 Most frequent bigrams (Top 30):
 1. don know                   1,014 times
 2. feel like                    763 times
 3. don think                    660 times
 4. don want                     583 times
 5. day trip                     545 times
 6. make sure                    504 times
 7. years ago                    499 times
 8. felt like                    436 times
 9. atlas mountains              382 times
10. feels like                   324 times
11. just like                    281 times
12. looks like                   276 times
13. let know                     274 times
14. spend time                   273 times
15. lot people                   271 times
16. just want                    255 times
17. didn want                    251 times
18. don need                     251 times
19. people don                   251 times
20. year old                     247 times
21. don like                     242 times
22. don really                   241 times

In [273]:
# ========== TRIGRAMS ANALYSIS ==========
print("\n" + "="*60)
print("📊 TRIGRAMS ANALYSIS")
print("="*60)

# Vectorizer pour les trigrammes
trigram_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(3, 3),
    min_df=2,  # Plus faible pour les trigrammes car ils sont moins fréquents
    lowercase=True,
    token_pattern=r'[a-zA-Z]{3,}'
)

trigram_matrix = trigram_vectorizer.fit_transform(df["content"])
trigram_features = trigram_vectorizer.get_feature_names_out()
trigram_counts = trigram_matrix.sum(axis=0).A1
trigram_freq = sorted(zip(trigram_features, trigram_counts), key=lambda x: x[1], reverse=True)

print(f"\n🏆 Most frequent trigrams (Top 30):")
for i, (trigram, count) in enumerate(trigram_freq[:30], 1):
    print(f"{i:2d}. {trigram:35} {count:6,} times")


📊 TRIGRAMS ANALYSIS

🏆 Most frequent trigrams (Top 30):
 1. ait ben haddou                          84 times
 2. best time visit                         83 times
 3. sidi bou said                           68 times
 4. unesco world heritage                   57 times
 5. high atlas mountains                    56 times
 6. contact moderators subreddit            54 times
 7. moderators subreddit message            54 times
 8. president joseph aoun                   54 times
 9. subreddit message compose               54 times
10. action performed automatically          53 times
11. automatically contact moderators        53 times
12. bot action performed                    53 times
13. performed automatically contact         53 times
14. doesn make sense                        48 times
15. world heritage site                     47 times
16. runs away cornucopia                    45 times
17. just make sure                          44 times
18. don really know                       

In [274]:
# ========== N-GRAMS STATISTICS ==========
print("\n" + "="*60)
print("📈 N-GRAMS STATISTICS")
print("="*60)

print(f"\n📊 N-grams Statistics:")
print(f"   Total unique unigrams: {len(word_freq):,}")
print(f"   Total unique bigrams: {len(bigram_freq):,}")
print(f"   Total unique trigrams: {len(trigram_freq):,}")


📈 N-GRAMS STATISTICS

📊 N-grams Statistics:
   Total unique unigrams: 29,545
   Total unique bigrams: 69,508
   Total unique trigrams: 77,814


In [275]:
# ========== TOURISM TERMS ANALYSIS ==========
print("\n" + "="*60)
print("🎯 TOURISM CATEGORIES ANALYSIS")
print("="*60)




word_dict = dict(word_freq)

print("\n🎯 Tourism Categories - Word Occurrences:")
for category, terms in tourism_terms.items():
    print(f"\n{category}:")
    all_terms = [(term, word_dict.get(term, 0)) for term in terms]
    all_terms.sort(key=lambda x: x[1], reverse=True)
    
    for term, count in all_terms:
        if count > 0:
            print(f"   {term:12} {count:6,} times")
        else:
            print(f"   {term:12} {count:6,} times (not found)")

# ========== TOURISM-RELATED N-GRAMS ==========
print("\n" + "="*60)
print("🏖️ TOURISM-RELATED N-GRAMS")
print("="*60)

# Filtrer les bigrammes liés au tourisme
tourism_bigrams = []
for bigram, count in bigram_freq:
    words = bigram.split()
    if any(word in ['morocco', 'marrakech', 'casablanca', 'travel', 'tour', 'visit', 'hotel', 'desert', 'medina', 'trip', 'tourist', 'tourism'] for word in words):
        tourism_bigrams.append((bigram, count))

print(f"\n🏖️ Tourism-related bigrams (Top 20):")
for i, (bigram, count) in enumerate(tourism_bigrams[:20], 1):
    print(f"{i:2d}. {bigram:25} {count:6,} times")

# Filtrer les trigrammes liés au tourisme
tourism_trigrams = []
for trigram, count in trigram_freq:
    words = trigram.split()
    if any(word in ['morocco', 'marrakech', 'casablanca', 'travel', 'tour', 'visit', 'hotel', 'desert', 'medina', 'trip', 'tourist', 'tourism'] for word in words):
        tourism_trigrams.append((trigram, count))

print(f"\n🏖️ Tourism-related trigrams (Top 20):")
for i, (trigram, count) in enumerate(tourism_trigrams[:20], 1):
    print(f"{i:2d}. {trigram:35} {count:6,} times")


print(f"   Tourism-related bigrams found: {len(tourism_bigrams):,}")
print(f"   Tourism-related trigrams found: {len(tourism_trigrams):,}")


🎯 TOURISM CATEGORIES ANALYSIS

🎯 Tourism Categories - Word Occurrences:

Attractions Naturelles:
   desert        1,481 times
   beach         1,389 times
   mountains       943 times
   sahara          610 times
   atlas           600 times
   nature          581 times
   sea             560 times
   coast           505 times
   valley          447 times
   ocean           305 times
   dunes           245 times
   gorge           150 times
   oasis           127 times
   waterfall        75 times
   agafay           75 times
   canyon           56 times
   anti-atlas        0 times (not found)
   high atlas        0 times (not found)
   middle atlas      0 times (not found)
   palm grove        0 times (not found)

Sites Culturels:
   medina          870 times
   market          771 times
   museum          566 times
   palace          379 times
   mosque          358 times
   architecture    329 times
   ruins           234 times
   kasbah          174 times
   souk            154 t

# Étape 4 – Visualisation avec Streamlit

### Lancer l'application web streamlit : (dashboard.py)