In [None]:
"""
Malaysia Airlines Competitive Analysis - Natural Language Processing
==================================================================
Applied NLP techniques for customer sentiment and competitive language pattern analysis.
Scope: Text mining across 8,137 reviews with aviation-specific preprocessing and sentiment analysis
Methods: VADER sentiment analysis, bigram network graphs, TF-IDF distinctiveness, aspect-based sentiment
Key Features: Enhanced stopwords, sentiment-weighted networks, airline-specific language patterns
Findings: Sentiment gap -0.154 vs industry, distinctive cultural terms, service aspect performance gaps
Visualization: Network graphs, word clouds, TF-IDF charts, aspect sentiment comparison matrices
"""

In [None]:
## Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
from collections import Counter, defaultdict
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import networkx as nx
from matplotlib.patches import Patch
warnings.filterwarnings('ignore')

In [None]:
## Download required NLTK data
# The download code below is deliberately wrapped in a triple-quoted string, so
# as written this cell is an inert string literal and downloads nothing.
# First run only: remove the surrounding triple quotes, execute the cell once,
# then put the quotes back.
"""
try:
    nltk.download('punkt_tab', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    nltk.download('omw-1.4', quiet=True)
except:
    print("NLTK downloads may require internet connection")
"""

In [None]:
## Read cleaned dataset
# Point the placeholder path below at the cleaned reviews CSV before running.
df = pd.read_csv(r"CLEAN_CSV PATH HERE") # cleaned_csv PATH here

# Notebook display settings: no limit on displayed columns, at most 50 rows,
# and no fixed display width.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.width = None

In [None]:
## Advanced Text Preprocessing
def advanced_text_preprocessing(df):
    """Clean review text, derive bigrams and attach VADER sentiment to ``df``.

    The frame is modified in place (and also returned) with four new columns:

    * ``review_clean`` - lower-cased review with every character that is not
      a letter or whitespace removed, and whitespace collapsed.
    * ``bigrams`` - list of ``"word1 word2"`` strings built from the cleaned
      text after dropping stop words and tokens of two characters or fewer.
    * ``sentiment_score`` - VADER compound score of the raw review
      (0 when the review is missing).
    * ``sentiment_category`` - 'Positive' (score >= 0.1), 'Negative'
      (score <= -0.1), otherwise 'Neutral'.

    Args:
        df: DataFrame with a ``review`` column.

    Returns:
        The same DataFrame object with the columns above added.
    """
    print("=== TEXT PREPROCESSING ===")

    # NLP tooling
    analyzer = SentimentIntensityAnalyzer()

    # English stop words plus aviation / brand / place terms that carry no signal
    domain_stopwords = {
        'airline', 'airlines', 'flight', 'flights', 'plane', 'aircraft',
        'airport', 'time', 'hour', 'hours', 'day', 'trip', 'travel',
        'fly', 'flying', 'passenger', 'passengers', 'would', 'could',
        'really', 'quite', 'very', 'much', 'many', 'also', 'well',
        'get', 'got', 'go', 'went', 'come', 'came', 'take', 'took',
        'one', 'two', 'first', 'last', 'back', 'way', 'class', 'business',
        'economy', 'malaysia', 'singapore', 'qatar', 'emirates', 'airasia',
        'kuala', 'lumpur', 'doha', 'dubai', 'changi', 'klia'
    }
    stop_words = set(stopwords.words('english')) | domain_stopwords

    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')

    def clean_text(text):
        # Missing reviews become an empty string so downstream steps can skip them
        if pd.isna(text):
            return ""
        lowered = str(text).lower()
        letters_only = non_letter_pattern.sub('', lowered)
        return ' '.join(letters_only.split())

    def extract_bigrams(text):
        if not text:
            return []
        kept_tokens = [
            token for token in word_tokenize(text)
            if len(token) > 2 and token not in stop_words
        ]
        return [' '.join(pair) for pair in ngrams(kept_tokens, 2)]

    def compound_score(review):
        # Scored on the raw review (not the cleaned text); missing reviews score 0
        if pd.notna(review):
            return analyzer.polarity_scores(str(review))['compound']
        return 0

    def categorize_sentiment(score):
        if score >= 0.1:
            return 'Positive'
        if score <= -0.1:
            return 'Negative'
        return 'Neutral'

    # Text cleaning and bigram extraction
    df['review_clean'] = df['review'].apply(clean_text)
    df['bigrams'] = df['review_clean'].apply(extract_bigrams)

    # Sentiment scoring and bucketing
    df['sentiment_score'] = df['review'].apply(compound_score)
    df['sentiment_category'] = df['sentiment_score'].apply(categorize_sentiment)

    print(f"Processed {len(df):,} reviews")
    print(df['sentiment_category'].value_counts())

    return df

In [None]:
## Bigram Network Graph
def create_bigram_network_graph(df):
    """Plot a word co-occurrence network for Malaysia Airlines reviews.

    Bigrams longer than six characters are counted across all rows whose
    ``airline`` is ``'malaysia_airlines'``. The 30 most frequent bigrams become
    edges between their two words; node size reflects the summed edge weights
    and node colour the mean sentiment of the bigrams the word occurs in.

    Args:
        df: DataFrame with ``airline``, ``bigrams`` (list of ``"w1 w2"``
            strings per row) and ``sentiment_score`` columns.

    Returns:
        Tuple ``(bigram_frequency, bigram_avg_sentiment)``: a ``Counter`` over
        all qualifying bigrams and a dict mapping each of the top 30 bigrams to
        its mean sentiment score. Both are empty when ``df`` holds no Malaysia
        Airlines reviews.
    """
    print("\n=== BIGRAM NETWORK GRAPH ANALYSIS ===")

    # Bound up front so every exit path has something to return; previously a
    # frame without Malaysia Airlines rows raised UnboundLocalError at `return`.
    bigram_frequency = Counter()
    bigram_avg_sentiment = {}

    # Focus on Malaysia Airlines for detailed analysis
    mab_data = df[df['airline'] == 'malaysia_airlines']
    if mab_data.empty:
        print("No Malaysia Airlines reviews found - skipping network graph.")
        return bigram_frequency, bigram_avg_sentiment

    # Collect every bigram together with the sentiment of the review it came from
    bigram_sentiment = defaultdict(list)
    for sentiment, bigrams in zip(mab_data['sentiment_score'], mab_data['bigrams']):
        for bigram in bigrams:
            if len(bigram) > 6:  # Filter meaningful bigrams
                bigram_frequency[bigram] += 1
                bigram_sentiment[bigram].append(sentiment)

    # Keep the most frequent bigrams and average their sentiment
    top_bigrams = dict(bigram_frequency.most_common(30))
    bigram_avg_sentiment = {
        bigram: np.mean(bigram_sentiment[bigram]) for bigram in top_bigrams
    }

    # Build the word graph: one edge per bigram, weighted by frequency
    G = nx.Graph()
    for bigram, freq in top_bigrams.items():
        words = bigram.split()
        if len(words) != 2:
            continue
        word1, word2 = words
        if G.has_edge(word1, word2):
            # "a b" and "b a" share one undirected edge: add the counts up
            # instead of letting the later bigram overwrite the earlier one.
            G[word1][word2]['weight'] += freq
        else:
            G.add_edge(word1, word2, weight=freq)

    if G.number_of_nodes() == 0:
        print("No bigrams available for Malaysia Airlines - skipping network graph.")
        return bigram_frequency, bigram_avg_sentiment

    # Node size: total weight of the edges touching the word
    node_sizes = [
        100 * sum(G[node][neighbor]['weight'] for neighbor in G.neighbors(node))
        for node in G.nodes()
    ]

    # Node colour: mean of the average sentiment of every top bigram containing the word
    word_sentiments = defaultdict(list)
    for bigram, avg_sentiment in bigram_avg_sentiment.items():
        for word in set(bigram.split()):
            word_sentiments[word].append(avg_sentiment)
    node_colors = [
        np.mean(word_sentiments[node]) if word_sentiments.get(node) else 0
        for node in G.nodes()
    ]

    # Deterministic layout: seed the layout itself rather than reseeding
    # NumPy's global random state as a side effect.
    pos = nx.spring_layout(G, k=3, iterations=50, seed=42)

    # Create visualization
    fig, ax = plt.subplots(figsize=(16, 12))

    nx.draw_networkx_nodes(G, pos, ax=ax, node_size=node_sizes, node_color=node_colors,
                           cmap='RdYlGn', vmin=-0.5, vmax=0.5, alpha=0.8)

    # Edge thickness follows bigram frequency
    edge_widths = [G[u][v]['weight'] / 5 for u, v in G.edges()]
    nx.draw_networkx_edges(G, pos, ax=ax, width=edge_widths, alpha=0.6, edge_color='gray')

    nx.draw_networkx_labels(G, pos, ax=ax, font_size=10, font_weight='bold')

    ax.set_title('Malaysia Airlines: Customer Language Network\n(Node size=frequency, Color=sentiment)',
                 fontsize=16, fontweight='bold', pad=20)

    # Colorbar uses the same colormap and range as the nodes
    sm = plt.cm.ScalarMappable(cmap='RdYlGn', norm=plt.Normalize(vmin=-0.5, vmax=0.5))
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax)
    cbar.set_label('Average Sentiment', fontweight='bold')

    ax.axis('off')
    plt.tight_layout()
    plt.show()

    # Print top insights
    print("=== TOP CONNECTED WORD PAIRS (Malaysia Airlines) ===")
    for bigram, freq in bigram_frequency.most_common(10):
        sentiment = bigram_avg_sentiment.get(bigram, 0)
        sentiment_label = "Positive" if sentiment > 0.1 else "Negative" if sentiment < -0.1 else "Neutral"
        print(f"  '{bigram}': {freq} mentions ({sentiment_label})")

    return bigram_frequency, bigram_avg_sentiment

In [None]:
## TF-IDF Weighted Bar Charts
def create_tfidf_analysis(df):
    """Chart the highest-scoring TF-IDF terms per airline, coloured by sentiment.

    All cleaned reviews of an airline are concatenated into one document, the
    documents are vectorised with unigram + bigram TF-IDF, and the ten
    highest-scoring terms of each plotted airline are drawn as horizontal bars.
    Each bar is coloured by the mean sentiment of that airline's reviews that
    contain the term as a whole word / whole phrase.

    Only the first four airlines (in order of first appearance in ``df``) are
    plotted, because the figure is a fixed 2x2 grid.

    Args:
        df: DataFrame with ``airline``, ``review_clean`` and
            ``sentiment_score`` columns.

    Returns:
        Tuple ``(feature_names, tfidf_matrix)`` from the fitted vectorizer;
        the matrix has one row per airline.
    """
    print("\n=== TF-IDF IMPORTANCE ANALYSIS ===")

    # Prepare one document (and one average sentiment) per airline
    airline_documents = {}
    airline_sentiments = {}
    airline_reviews = {}
    for airline in df['airline'].unique():
        airline_data = df[df['airline'] == airline]
        airline_reviews[airline] = airline_data
        airline_documents[airline] = ' '.join(airline_data['review_clean'].fillna(''))
        airline_sentiments[airline] = airline_data['sentiment_score'].mean()

    # TF-IDF Analysis
    vectorizer = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.8, ngram_range=(1, 2), stop_words='english')

    airlines = list(airline_documents.keys())
    documents = list(airline_documents.values())

    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # First four airlines by order of appearance (the grid has four panels)
    airlines_to_plot = airlines[:4]

    # Rank terms once per airline; reused by both the chart and the printout
    top_terms_by_airline = {}
    for airline_idx, airline in enumerate(airlines_to_plot):
        tfidf_scores = tfidf_matrix[airline_idx].toarray()[0]
        top_indices = tfidf_scores.argsort()[-10:][::-1]
        top_terms_by_airline[airline] = [
            (str(feature_names[i]), tfidf_scores[i]) for i in top_indices
        ]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('TF-IDF Analysis: Most Distinctive Terms by Airline', fontsize=16, fontweight='bold')
    panels = list(axes.flat)

    for ax, airline in zip(panels, airlines_to_plot):
        top_terms = [term for term, _ in top_terms_by_airline[airline]]
        top_scores = [score for _, score in top_terms_by_airline[airline]]
        reviews = airline_reviews[airline]

        # Mean sentiment of the reviews mentioning each term. The term is
        # escaped and anchored on word boundaries so that e.g. "eat" does not
        # count reviews that merely contain "seat".
        term_sentiments = []
        for term in top_terms:
            pattern = rf'\b{re.escape(term)}\b'
            mentions = reviews['review_clean'].str.contains(pattern, na=False)
            matched_scores = reviews.loc[mentions, 'sentiment_score']
            term_sentiments.append(matched_scores.mean() if len(matched_scores) > 0 else 0)

        # Colour bars by sentiment bucket
        colors = ['red' if s < -0.1 else 'green' if s > 0.1 else 'orange' for s in term_sentiments]

        y_pos = np.arange(len(top_terms))
        ax.barh(y_pos, top_scores, color=colors, alpha=0.7)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(top_terms)
        ax.set_xlabel('TF-IDF Score')

        airline_name = airline.replace('_', ' ').title()
        avg_sentiment = airline_sentiments[airline]
        ax.set_title(f'{airline_name}\nAvg Sentiment: {avg_sentiment:.3f}', fontweight='bold')

        # Annotate each bar with its sentiment value
        for i, (score, sentiment) in enumerate(zip(top_scores, term_sentiments)):
            ax.text(score + 0.001, i, f'{sentiment:.2f}', va='center', fontsize=8)

    # Hide panels that have no airline to show
    for ax in panels[len(airlines_to_plot):]:
        ax.axis('off')

    # Add legend
    legend_elements = [
        Patch(facecolor='green', alpha=0.7, label='Positive Sentiment'),
        Patch(facecolor='orange', alpha=0.7, label='Neutral Sentiment'),
        Patch(facecolor='red', alpha=0.7, label='Negative Sentiment')
    ]
    fig.legend(handles=legend_elements, loc='lower right', bbox_to_anchor=(0.98, 0.95))

    plt.tight_layout()
    plt.show()

    # Print insights (top five terms are the head of the top-ten ranking)
    print("=== TF-IDF Insights ===")
    for airline in airlines_to_plot:
        print(f"{airline.replace('_', ' ').title()} - Most Distinct Terms:")
        for term, _ in top_terms_by_airline[airline][:5]:
            print(f"  {term}")

    return feature_names, tfidf_matrix

In [None]:
## Service Aspect Sentiment Analysis
def analyze_service_aspects_sentiment(df, service_aspects=None):
    """Compare average sentiment per service aspect across airlines.

    A review counts towards an aspect when any of the aspect's keywords occurs
    at the start of a word in ``review_clean`` (so ``seat`` matches ``seats``
    and ``cancel`` matches ``cancelled``, but ``counter`` no longer matches
    ``encountered`` and ``crew`` no longer matches ``screw``). Keywords are
    treated as literal text, not as regular expressions.

    Args:
        df: DataFrame with ``airline``, ``review_clean`` and
            ``sentiment_score`` columns.
        service_aspects: Optional mapping ``{aspect name: [keywords]}``.
            Defaults to the built-in crew / food / seat / check-in / lounge /
            refund keyword table.

    Returns:
        DataFrame with one row per (airline, aspect) that had at least one
        matching review, with columns ``airline``, ``aspect``, ``sentiment``
        (mean score) and ``review_count``. Empty when nothing matched, in
        which case no chart is drawn.
    """
    print("\n=== SERVICE ASPECT SENTIMENT ANALYSIS ===")

    if service_aspects is None:
        # Define service aspects with keywords
        service_aspects = {
            'Crew': ['crew', 'staff', 'attendant', 'steward', 'stewardess', 'cabin crew'],
            'Food': ['food', 'meal', 'dining', 'breakfast', 'lunch', 'dinner', 'cuisine'],
            # 'uncomfortable' / 'discomfort' are listed explicitly because word-start
            # matching means 'comfort' alone no longer catches them.
            'Seat': ['seat', 'seating', 'comfort', 'uncomfortable', 'discomfort', 'legroom', 'space', 'chair'],
            # 'check in' covers the two-word spelling; 'check-in' is kept for text
            # that still contains hyphens.
            'Check-in': ['checkin', 'check-in', 'check in', 'boarding', 'gate', 'counter'],
            'Lounge': ['lounge', 'waiting', 'terminal', 'amenity', 'facility'],
            'Refund': ['refund', 'money', 'compensation', 'reimburse', 'cancel']
        }

    # One word-start-anchored pattern per aspect, built once up front
    aspect_patterns = {}
    for aspect, keywords in service_aspects.items():
        if not keywords:
            continue  # an empty alternation would match every review
        alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
        aspect_patterns[aspect] = r'\b(?:' + alternatives + ')'

    # Calculate aspect sentiment for each airline
    aspect_sentiment_data = []
    for airline in df['airline'].unique():
        airline_data = df[df['airline'] == airline]

        for aspect, pattern in aspect_patterns.items():
            # Find reviews mentioning this aspect
            mentions = airline_data['review_clean'].str.contains(pattern, na=False, case=False)
            aspect_reviews = airline_data[mentions]

            if len(aspect_reviews) > 0:
                aspect_sentiment_data.append({
                    'airline': airline,
                    'aspect': aspect,
                    'sentiment': aspect_reviews['sentiment_score'].mean(),
                    'review_count': len(aspect_reviews)
                })

    aspect_df = pd.DataFrame(aspect_sentiment_data)

    if len(aspect_df) > 0:
        # Create grouped bar chart
        fig, ax = plt.subplots(figsize=(16, 8))

        # Pivot data for plotting: rows = aspects, columns = airlines
        pivot_data = aspect_df.pivot(index='aspect', columns='airline', values='sentiment')

        # Fixed brand colours; airlines not listed fall back to gray
        colors = {
            'emirates': 'red',
            'malaysia_airlines': 'blue',
            'qatar_airways': 'maroon',
            'singapore_airlines': 'gold'
        }

        pivot_data.plot(
            kind='bar',
            ax=ax,
            width=0.8,
            alpha=0.8,
            color=[colors.get(col, 'gray') for col in pivot_data.columns]
        )

        ax.set_title('Service Aspect Sentiment Analysis by Airline', fontsize=16, fontweight='bold')
        ax.set_xlabel('Service Aspect', fontweight='bold')
        ax.set_ylabel('Average Sentiment Score', fontweight='bold')
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5, alpha=0.7)
        ax.grid(axis='y', alpha=0.3)
        ax.legend(title='Airline', bbox_to_anchor=(1.05, 1), loc='upper left')

        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Print insights
        print("=== SERVICE ASPECT PERFORMANCE (Malaysia Airlines vs Competitors) ===")
        if 'malaysia_airlines' in pivot_data.columns:
            mab_scores = pivot_data['malaysia_airlines'].dropna()

            for aspect in mab_scores.index:
                mab_score = mab_scores[aspect]
                competitor_scores = pivot_data.loc[aspect].drop('malaysia_airlines').dropna()

                if len(competitor_scores) > 0:
                    avg_competitor = competitor_scores.mean()
                    gap = mab_score - avg_competitor

                    # Gaps within +/-0.05 are reported as "Similar"
                    status = "Better" if gap > 0.05 else "Worse" if gap < -0.05 else "Similar"
                    print(f"  {aspect}: {mab_score:.3f} vs {avg_competitor:.3f} (Gap: {gap:+.3f}) - {status}")

    return aspect_df

In [None]:
## Word Cloud
def create_sanitized_wordcloud(df):
    """Draw a 2x2 grid of bigram word clouds centred on Malaysia Airlines.

    Panels: all Malaysia Airlines bigrams, bigrams from positive reviews
    (VADER compound >= 0.1), bigrams from negative reviews (compound <= -0.1),
    and bigrams from positive Qatar Airways / Singapore Airlines reviews with
    ``overall_rating`` >= 8. Bigrams are built from the raw ``review`` text
    after lower-casing, stripping non-letters and removing stop words; only
    bigrams longer than eight characters are kept. A panel is left blank when
    no bigram reaches its minimum count.

    Does nothing beyond printing the section header when ``df`` contains no
    ``'malaysia_airlines'`` rows.

    Args:
        df: DataFrame with ``airline``, ``review`` and ``overall_rating`` columns.

    Returns:
        None. The figure is shown and the top bigrams are printed.
    """
    print("\n=== WORD CLOUD GENERATION ===")

    # Focus on Malaysia Airlines; bail out before loading any NLP resources
    # when there is nothing to plot.
    mab_data = df[df['airline'] == 'malaysia_airlines']
    if mab_data.empty:
        return

    # Enhanced stopwords for cleaner results
    enhanced_stopwords = set(stopwords.words('english'))
    enhanced_stopwords.update({
        'flight', 'flights', 'airline', 'airlines', 'aircraft', 'plane',
        'airport', 'time', 'hour', 'hours', 'day', 'trip', 'travel',
        'passenger', 'passengers', 'would', 'could', 'really', 'quite',
        'very', 'much', 'many', 'also', 'well', 'get', 'got', 'go',
        'went', 'come', 'came', 'take', 'took', 'one', 'two', 'way',
        'back', 'first', 'last', 'class', 'business', 'economy',
        'malaysia', 'singapore', 'qatar', 'emirates', 'airasia',
        'kuala', 'lumpur', 'doha', 'dubai', 'changi', 'klia'
    })

    # Built once: the analyzer used to be re-created for every single review.
    analyzer = SentimentIntensityAnalyzer()

    def score_reviews(reviews):
        """Return ``[(compound_score, [bigrams])]`` for every non-missing review."""
        scored = []
        for review in reviews:
            if pd.isna(review):
                continue
            text = str(review)
            compound = analyzer.polarity_scores(text)['compound']
            cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
            words = [word for word in cleaned.split()
                     if word not in enhanced_stopwords and len(word) > 2]
            bigrams = [f"{first} {second}" for first, second in zip(words, words[1:])]
            # Filter meaningful bigrams
            scored.append((compound, [bigram for bigram in bigrams if len(bigram) > 8]))
        return scored

    def select_bigrams(scored, sentiment_type='all'):
        """Flatten the bigrams of the reviews that fall in the requested sentiment bucket."""
        selected = []
        for compound, bigrams in scored:
            if sentiment_type == 'positive' and compound < 0.1:
                continue
            if sentiment_type == 'negative' and compound > -0.1:
                continue
            selected.extend(bigrams)
        return selected

    def draw_panel(ax, bigrams, min_count, colormap, max_words, title, title_color=None):
        """Render one word cloud; the panel stays blank if too few bigrams repeat."""
        # Turn the axes off first so a panel without data does not show an empty frame
        ax.axis('off')
        frequencies = {bigram: count for bigram, count in Counter(bigrams).items()
                       if count >= min_count}
        if not frequencies:
            return
        cloud = WordCloud(width=400, height=300, background_color='white', colormap=colormap,
                          max_words=max_words, relative_scaling=0.6, collocations=False
                          ).generate_from_frequencies(frequencies)
        ax.imshow(cloud, interpolation='bilinear')
        title_style = {'fontweight': 'bold'}
        if title_color is not None:
            title_style['color'] = title_color
        ax.set_title(title, **title_style)

    # Score each Malaysia Airlines review once and reuse it for all three buckets
    mab_scored = score_reviews(mab_data['review'])
    all_bigrams = select_bigrams(mab_scored)
    positive_bigrams = select_bigrams(mab_scored, 'positive')
    negative_bigrams = select_bigrams(mab_scored, 'negative')

    # Top competitive insights: highly rated, positive Qatar / Singapore reviews
    competitor_data = df[df['airline'].isin(['qatar_airways', 'singapore_airlines'])]
    high_rating_reviews = competitor_data[competitor_data['overall_rating'] >= 8]['review']
    competitor_bigrams = select_bigrams(score_reviews(high_rating_reviews), 'positive')

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(18, 12))
    fig.suptitle('Malaysia Airlines: Bigram Word Clouds', fontsize=16, fontweight='bold')

    draw_panel(axes[0, 0], all_bigrams, 2, 'viridis', 50,
               f'Overall Customer Language\n({len(all_bigrams)} bigrams)')
    draw_panel(axes[0, 1], positive_bigrams, 2, 'Greens', 40,
               f'Positive Experiences\n({len(positive_bigrams)} bigrams)', 'darkgreen')
    draw_panel(axes[1, 0], negative_bigrams, 2, 'Reds', 40,
               f'Negative Experiences\n({len(negative_bigrams)} bigrams)', 'darkred')
    draw_panel(axes[1, 1], competitor_bigrams, 3, 'Blues', 40,
               'Competitor Excellence\n(Qatar/Singapore High Ratings)', 'darkblue')

    plt.tight_layout()
    plt.show()

    # Print top insights
    print("=== MALAYSIA AIRLINES - TOP POSITIVE BIGRAMS ===")
    if positive_bigrams:
        for bigram, count in Counter(positive_bigrams).most_common(8):
            print(f"  '{bigram}': {count} mentions")

    print("\n=== MALAYSIA AIRLINES - TOP NEGATIVE BIGRAMS ===")
    if negative_bigrams:
        for bigram, count in Counter(negative_bigrams).most_common(8):
            print(f"  '{bigram}': {count} mentions")

    print("\n=== COMPETITOR EXCELLENCE PATTERNS ===")
    if competitor_bigrams:
        for bigram, count in Counter(competitor_bigrams).most_common(6):
            print(f"  '{bigram}': {count} mentions")

In [None]:
## Strategic Insights Summary
def generate_strategic_insights(df):
    """Print a sentiment scorecard for Malaysia Airlines against all airlines.

    Reports the airline's mean sentiment next to the mean over every review in
    ``df`` (Malaysia Airlines included), the share of each sentiment category
    among its reviews, and its position in the ranking of airlines by mean
    sentiment. Only the section header is printed when ``df`` has no
    ``'malaysia_airlines'`` rows.

    Args:
        df: DataFrame with ``airline``, ``sentiment_score`` and
            ``sentiment_category`` columns.

    Returns:
        None.
    """
    print("\n=== MALAYSIA AIRLINES NLP INSIGHTS SUMMARY ===")

    is_mab = df['airline'] == 'malaysia_airlines'
    if not is_mab.any():
        return
    mab_data = df[is_mab]

    # Overall sentiment performance
    mab_sentiment = mab_data['sentiment_score'].mean()
    industry_avg = df['sentiment_score'].mean()
    performance_gap = mab_sentiment - industry_avg

    print("OVERALL SENTIMENT PERFORMANCE")
    print(f"  Malaysia Airlines: {mab_sentiment:.3f}")
    print(f"  Industry Average: {industry_avg:.3f}")
    print(f"  Performance Gap: {performance_gap:+.3f}")

    # Sentiment distribution (percent of Malaysia Airlines reviews per category)
    category_shares = mab_data['sentiment_category'].value_counts(normalize=True).mul(100)
    print("\nSENTIMENT DISTRIBUTION")
    for label, share in category_shares.items():
        print(f"  {label}: {share:.1f}%")

    # Competitive ranking, best mean sentiment first
    ranking = df.groupby('airline')['sentiment_score'].mean().sort_values(ascending=False)
    ranked_airlines = ranking.index.tolist()
    mab_rank = ranked_airlines.index('malaysia_airlines') + 1

    print("\nCOMPETITIVE POSITION")
    print(f"  Current Rank: #{mab_rank} out of {len(ranked_airlines)} airlines")

    print("\nRANKING")
    for place, (airline, score) in enumerate(zip(ranked_airlines, ranking.tolist()), start=1):
        print(f"  {place}. {airline.replace('_', ' ').title()}: {score:.3f}")

In [None]:
## Execute Complete NLP
def execute_advanced_nlp_pipeline(df):
    """Run every analysis stage in order on ``df`` and collect the key outputs.

    Stages: text preprocessing, bigram network graph, TF-IDF charts, service
    aspect sentiment, word clouds, and the printed insights summary.

    Args:
        df: Reviews DataFrame as expected by ``advanced_text_preprocessing``.

    Returns:
        Tuple ``(df, bigram_freq, tfidf_features, aspect_results)``: the
        preprocessed frame, the bigram frequency counter, the TF-IDF feature
        names and the per-aspect sentiment table. The per-bigram sentiment map
        and the TF-IDF matrix are computed by their stages but not returned.
    """
    # Stage 1: cleaning, bigrams and sentiment columns
    reviews = advanced_text_preprocessing(df)

    # Stage 2: bigram network (average-sentiment map is not passed on)
    bigram_freq, _bigram_avg_sentiment = create_bigram_network_graph(reviews)

    # Stage 3: TF-IDF charts (matrix is not passed on)
    tfidf_features, _tfidf_matrix = create_tfidf_analysis(reviews)

    # Stage 4: service aspect sentiment
    aspect_results = analyze_service_aspects_sentiment(reviews)

    # Stage 5: word clouds and printed summary (output only)
    create_sanitized_wordcloud(reviews)
    generate_strategic_insights(reviews)

    return reviews, bigram_freq, tfidf_features, aspect_results

In [None]:
## Run NLP
# Runs the full pipeline on the loaded reviews and unpacks, in order: the
# preprocessed frame, the bigram frequency counter, the TF-IDF feature names
# and the service-aspect sentiment table.
df_nlp, bigram_data, tfidf_data, service_data = execute_advanced_nlp_pipeline(df)

In [None]:
## Save dataset
# df_nlp.to_csv('airline_data_nlp.csv', index=False)