In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

sns.set_style('whitegrid')
%matplotlib inline

## Load Dataset

In [12]:
from sklearn.model_selection import train_test_split

# Split configuration: fraction of rows held out for testing, and the seed
# that makes the shuffle (and therefore the split) repeatable across runs.
TEST_SIZE = 0.2
RANDOM_STATE = 42

dataset = load_dataset("7Xan7der7/us_airline_sentiment")
print("Available splits:", list(dataset.keys()))

# Load the data: only the 'train' split is read here; the held-out test set
# below is carved out of it. `to_pandas()` is the library's own conversion,
# instead of letting the DataFrame constructor iterate the dataset row by row.
df = dataset['train'].to_pandas()

# Stratify on the label so both partitions keep the same sentiment proportions.
train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df['airline_sentiment'],
)

# Sanity check: the two partitions together must account for every row.
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")
train_df.head()

## Dataset Info

In [13]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train_df.info()

# Number of missing entries in each column.
missing_per_column = train_df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

## Class Distribution

In [14]:
# Count each sentiment label once and reuse the result for both panels and
# for the printed table below.
sentiment_counts = train_df['airline_sentiment'].value_counts()

# Explicit figure/axes handles instead of the implicit pyplot "current axes".
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: absolute counts per label.
sentiment_counts.plot(kind='bar', ax=bar_ax)
bar_ax.set_title('Sentiment Distribution (Train)')
bar_ax.set_xlabel('Sentiment')
bar_ax.set_ylabel('Count')

# Right panel: the same counts as shares of the training set.
sentiment_counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('Sentiment Distribution')
# The pie plot picks up a y-axis label from the Series; it carries no
# information on a pie chart, so blank it.
pie_ax.set_ylabel('')

fig.tight_layout()
plt.show()

print(sentiment_counts)

## Text Statistics

In [15]:
# Derive per-row text statistics:
#   text_length - number of characters in `text`
#   word_count  - number of whitespace-separated tokens in `text`
# `train_df` came out of train_test_split, so writing columns into it in place
# can raise SettingWithCopyWarning. `.assign` builds a new frame instead, and
# re-running this cell simply overwrites the two columns.
train_df = train_df.assign(
    text_length=train_df['text'].str.len(),
    word_count=train_df['text'].str.split().str.len(),
)

print("Text Length Statistics:")
print(train_df[['text_length', 'word_count']].describe())

# One histogram per statistic, side by side, on explicit axes.
fig, (length_ax, words_ax) = plt.subplots(1, 2, figsize=(12, 4))

train_df['text_length'].hist(bins=50, ax=length_ax)
length_ax.set_title('Text Length Distribution')
length_ax.set_xlabel('Length')
length_ax.set_ylabel('Count')

train_df['word_count'].hist(bins=50, ax=words_ax)
words_ax.set_title('Word Count Distribution')
words_ax.set_xlabel('Words')
words_ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

## Sample Texts by Sentiment

In [16]:
# Print the first three training texts for every sentiment label, in the
# order the labels first appear in the training frame.
sentiment_labels = train_df['airline_sentiment']

for sentiment in sentiment_labels.unique():
    # Section header: a rule, the upper-cased label, another rule.
    print(f"\n{'='*60}")
    print(f"Sentiment: {sentiment.upper()}")
    print('='*60)

    # Rows carrying this label, restricted to the text column.
    has_label = sentiment_labels == sentiment
    samples = train_df.loc[has_label, 'text'].head(3)

    for i, text in enumerate(samples, start=1):
        print(f"{i}. {text}")

## Duplicates Check

In [17]:
# Mark every text that repeats an earlier row (the first occurrence is not
# marked), then count the marked rows.
repeated_text_mask = train_df['text'].duplicated(keep='first')
duplicates = repeated_text_mask.sum()

# Report the count and its share of the training set.
print(f"Duplicate texts in train: {duplicates}")
print(f"Percentage: {duplicates/len(train_df)*100:.2f}%")