[Reference](https://medium.com/@sarowar.saurav10/22-python-code-snippets-every-data-enthusiast-should-know-a7cea27da3aa)

# 1. Forecasting Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error

# Step 1: Load and Prepare Data
# 'monthly_sales.csv' is expected to provide 'date' and 'sales' columns;
# 'date' is parsed as datetime and used as the index.
df = pd.read_csv('monthly_sales.csv', parse_dates=['date'], index_col='date')

# Step 2: Visualize the Data
# Series.plot returns the Axes it drew on, so labels are set on it directly.
sales_ax = df['sales'].plot(figsize=(12, 6))
sales_ax.set_title('Monthly Sales Over Time')
sales_ax.set_xlabel('Date')
sales_ax.set_ylabel('Sales')
plt.show()

# Step 3: Check for Stationarity (Augmented Dickey-Fuller test)
from statsmodels.tsa.stattools import adfuller

result = adfuller(df['sales'])
# The first two entries of the result are the test statistic and its p-value.
adf_statistic, p_value = result[0], result[1]
print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')

# Reading the output: a p-value below 0.05 suggests the series is stationary;
# otherwise, consider differencing the data before modelling.

# Step 4: Fit the SARIMA Model
# Notation: SARIMA(p, d, q)(P, D, Q, s), where (P, D, Q, s) are the seasonal
# terms and s is the length of the seasonal cycle.
# Starting guess used here: (1, 1, 1)(1, 1, 1, 12)
sarima_order = (1, 1, 1)
sarima_seasonal_order = (1, 1, 1, 12)

model = SARIMAX(
    df['sales'],
    order=sarima_order,
    seasonal_order=sarima_seasonal_order,
)
model_fit = model.fit(disp=False)

# Print model summary
print(model_fit.summary())

# Step 5: Make Forecasts
# Forecast the next 12 months
forecast = model_fit.get_forecast(steps=12)

# Build an index that starts one period AFTER the last observation and uses
# the same spacing as the data. Starting the range at df.index[-1] itself
# would place the first forecast on top of the last observed period.
# Falls back to month-start spacing when no frequency can be inferred.
forecast_step = pd.tseries.frequencies.to_offset(pd.infer_freq(df.index) or 'MS')
forecast_index = pd.date_range(start=df.index[-1] + forecast_step, periods=12, freq=forecast_step)

# Copy the predicted values positionally. Passing the predicted_mean Series
# together with a new index would re-align by label and yield NaN wherever
# the labels differ.
forecast_series = pd.Series(np.asarray(forecast.predicted_mean), index=forecast_index)

# Plot the forecast
plt.figure(figsize=(12, 6))
plt.plot(df['sales'], label='Observed')
plt.plot(forecast_series, label='Forecast', color='red')
plt.title('Sales Forecast')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.show()

# Step 6: Evaluate the Model
# Hold out the last 12 observations as a test set and compare the forecast
# against them.
train = df['sales'].iloc[:-12]
test = df['sales'].iloc[-12:]

# Fit the model on the training set
model = SARIMAX(train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
model_fit = model.fit(disp=False)

# Forecast the test set period
forecast = model_fit.get_forecast(steps=12)
# Copy the 12 predicted values positionally onto the test dates. Passing the
# predicted_mean Series with index=test.index would re-align by label and
# produce NaN whenever the forecast labels do not match the test labels,
# which would make the RMSE computation below fail.
forecast_series = pd.Series(np.asarray(forecast.predicted_mean), index=test.index)

# Plot the results
plt.figure(figsize=(12, 6))
plt.plot(train, label='Training')
plt.plot(test, label='Test', color='green')
plt.plot(forecast_series, label='Forecast', color='red')
plt.title('Sales Forecast vs Actual')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.show()

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test, forecast_series))
print(f'Root Mean Squared Error: {rmse}')

# 2. Predicting Customer Churn

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the data, drop incomplete rows, and add the two aggregate features
# (total calls and total charge across day/evening/night/international).
df = (
    pd.read_csv('telecom_churn_data.csv')
    .dropna()
    .assign(
        total_calls=lambda d: (
            d['day_calls'] + d['evening_calls'] + d['night_calls'] + d['international_calls']
        ),
        total_charge=lambda d: (
            d['day_charge'] + d['evening_charge'] + d['night_charge'] + d['international_charge']
        ),
    )
)

# Encoding categorical variables
categorical_columns = ['state', 'area_code', 'international_plan', 'voice_mail_plan']
df = pd.get_dummies(df, columns=categorical_columns)

# Split Data: every column except 'churn' is a feature
X = df.drop(columns='churn')
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 3. Time Series Forecasting

In [3]:
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# Load Data
df = pd.read_csv('monthly_sales.csv', parse_dates=['date'], index_col='date')

# Decompose Time Series
# The seasonal period is passed explicitly (12 observations per cycle for
# monthly data) so the decomposition does not depend on a frequency being
# inferable from the index loaded by read_csv.
result = seasonal_decompose(df['sales'], model='additive', period=12)
result.plot()
plt.show()

# Train ARIMA Model
model = ARIMA(df['sales'], order=(1, 1, 1))
model_fit = model.fit()

# Forecast the next 12 periods and label the plot so it can be read on its own
forecast = model_fit.forecast(steps=12)
forecast_ax = forecast.plot()
forecast_ax.set_title('ARIMA Sales Forecast')
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Sales')
plt.show()

# 4. Market Basket Analysis

In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load Data
df = pd.read_csv('ecommerce_transactions.csv')

# Preprocess Data
# Total quantity per (transaction, product) pair ...
quantity_per_item = df.groupby(['transaction_id', 'product_name'])['quantity'].sum()

# ... pivoted so each transaction is a row and each product a column, with 0
# for products absent from a transaction.
basket = (
    quantity_per_item
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('transaction_id')
)

def encode_units(x):
    """Turn a quantity into a presence flag: 1 when x is at least 1, else 0."""
    if x >= 1:
        return 1
    return 0

# Convert quantities to presence flags in one vectorized comparison
# (True when at least one unit was bought). This replaces the element-wise
# DataFrame.applymap call, which is deprecated in recent pandas, and yields
# the boolean frame that mlxtend's apriori recommends as input.
basket = basket.ge(1)

# Apriori Algorithm
# Keep itemsets present in at least 1% of transactions, then keep rules
# whose lift is at least 1.
frequent_itemsets = apriori(basket, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display Rules
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# 5. Sentiment Analysis on Social Media

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load Data
# The rest of this cell reads the 'review' and 'sentiment' columns of this file.
df = pd.read_csv('social_media_reviews.csv')

# Clean Data
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the filter rebuilt the full list for every single word and searched it
# linearly; a frozenset gives constant-time membership checks.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalise one review string for bag-of-words vectorisation.

    Replaces every non-word character with a space, collapses whitespace
    runs, lower-cases the text, and removes English stop words.

    Args:
        text: The raw review text (a str).

    Returns:
        The remaining words joined by single spaces.
    """
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in _ENGLISH_STOPWORDS)

df['cleaned_review'] = df['review'].apply(clean_text)

# Preprocess Text
# Keep the bag-of-words matrix sparse: train_test_split and MultinomialNB both
# accept sparse input, so converting to a dense array only costs memory.
# NOTE(review): the vocabulary is fitted on all rows before the split, so test
# reviews influence feature selection — confirm this is acceptable.
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 6. Anomaly Detection in Network Traffic

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Load Data
df = pd.read_csv('network_traffic.csv')

# Feature Selection
features = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
X = df[features]

# Train the Isolation Forest and label every row with its prediction.
# fit() returns the fitted estimator, so predict() can be chained onto it.
model = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)
df['anomaly'] = model.fit(X).predict(X)

# Rows labelled -1 are the ones the model flagged
is_anomaly = df['anomaly'] == -1
anomalies = df[is_anomaly]

# Visualize Anomalies: colour each point by its label
plt.scatter(
    df['src_bytes'],
    df['dst_bytes'],
    c=df['anomaly'],
    cmap='coolwarm',
)
plt.xlabel('Source Bytes')
plt.ylabel('Destination Bytes')
plt.title('Anomaly Detection in Network Traffic')
plt.show()

## Load a CSV File

In [7]:
# Read 'data.csv' into a DataFrame. This cell has no import of its own and
# relies on `pd` having been imported by an earlier cell.
df = pd.read_csv('data.csv')

## Mutate Strings

In [8]:
# Create a sample string. It is named `text` rather than `str` so the built-in
# `str` type is not shadowed for the rest of the notebook.
text = "Rahim and Karim"

# Convert a string to uppercase
text.upper() # 'RAHIM AND KARIM'

# Convert a string to lowercase
text.lower() # 'rahim and karim'

# Convert a string to title case
text.title() # 'Rahim And Karim'

# Replace every occurrence of a substring with another.
# "J" does not occur in the text, so the string comes back unchanged.
text.replace("J", "P") # 'Rahim and Karim'

'Rahim and Karim'

## Handle Missing Values & Remove Duplicates

In [9]:
# Two alternative strategies for missing values are shown; normally use ONE.
# Option A: drop every row that contains a missing value.
df = df.dropna()
# Option B: fill missing values with the column mean. numeric_only=True
# restricts the mean to numeric columns; without it, recent pandas raises on
# text columns. When run after Option A there is nothing left to fill.
df = df.fillna(df.mean(numeric_only=True))

# Remove exact duplicate rows
df = df.drop_duplicates()

## Rename Columns

In [10]:
# Map each existing column name to its replacement, then rebind the result.
column_renames = {'old_name': 'new_name'}
df = df.rename(columns=column_renames)

## Group By and Aggregate

In [11]:
# Per-group sum of 'col1' and mean of 'col2', written as named aggregations
# that keep the original column names.
grouped_df = df.groupby('column').agg(
    col1=('col1', 'sum'),
    col2=('col2', 'mean'),
)

## Merge DataFrames

In [12]:
# Inner join: keep only rows whose 'common_column' value appears in both frames.
merged_df = df1.merge(df2, on='common_column', how='inner')

## Pivot Table

In [13]:
# Mean of 'value' for every ('index', 'columns') combination.
pivot = pd.pivot_table(
    df,
    values='value',
    index='index',
    columns='columns',
    aggfunc='mean',
)

## Apply Function to Column

In [14]:
def _double(x):
    """Return x multiplied by 2."""
    return x * 2


# Apply the function to every element of 'column'
df['new_column'] = df['column'].apply(_double)

## Model Evaluation — MSE and R²

In [15]:
# Import the metrics in this cell: r2_score is not imported by any earlier
# cell shown in this notebook, so the call below would otherwise fail.
from sklearn.metrics import mean_squared_error, r2_score

# NOTE(review): `y_test` and `predictions` are expected to hold the true and
# predicted targets of a regression model; no earlier cell shown here defines
# `predictions` — confirm where it comes from before running.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f'MSE: {mse}, R²: {r2}')

## Save Model

In [16]:
import joblib
# Serialise whatever object `model` currently refers to into 'model.pkl'.
# NOTE(review): `model` is rebound by several earlier cells — confirm it is the
# estimator you intend to save.
joblib.dump(model, 'model.pkl')

## Time Series — Convert to Datetime

In [17]:
# Parse 'date' into datetimes and make it the index in a single chain.
df = (
    df
    .assign(date=pd.to_datetime(df['date']))
    .set_index('date')
)

## Rolling Window Calculation

In [18]:
# Mean over a trailing window of 12 rows; the first 11 rows have no full
# window and therefore come out as NaN.
rolling_window = 12
df['rolling_mean'] = df['column'].rolling(window=rolling_window).mean()

## Principal Component Analysis (PCA)

In [19]:
from sklearn.decomposition import PCA

# Project the data onto its first two principal components.
# random_state is fixed (42, as for the other estimators in this notebook)
# because PCA may select a randomized solver for large inputs, which would
# otherwise make the components vary between runs.
# NOTE(review): `df_scaled` is not defined in the cells shown here; it is
# presumably a standardised copy of the numeric features — confirm.
pca = PCA(n_components=2, random_state=42)
principal_components = pca.fit_transform(df_scaled)

## K-Means Clustering

In [20]:
from sklearn.cluster import KMeans

# Partition the data into 3 clusters.
# random_state is fixed (42, as for the other estimators in this notebook):
# k-means starts from random centroids, so without a seed the cluster labels
# can change from one run to the next.
# NOTE(review): `df_scaled` is not defined in the cells shown here, and the
# assignment below assumes it has one row per row of `df`, in the same order —
# confirm.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(df_scaled)
df['cluster'] = kmeans.labels_

## Perform SQL Query on DataFrame

In [21]:
import pandasql as psql
# NOTE(review): `column` and `value` in the query look like placeholders to be
# replaced with a real column name and a literal — confirm before running.
query = "SELECT * FROM df WHERE column > value"
# locals() is passed so the table name `df` in the query can be looked up
# among the variables defined in this notebook.
result = psql.sqldf(query, locals())

## Create a Dashboard with Plotly

In [22]:
import plotly.express as px

# px.line looks up x='date' among the columns of the frame. An earlier cell
# moves 'date' into the index, which would make that lookup fail, so restore
# it as a column in that case; otherwise use the frame unchanged.
date_is_index_only = 'date' not in df.columns and df.index.name == 'date'
plot_df = df.reset_index() if date_is_index_only else df

fig = px.line(plot_df, x='date', y='value')
fig.show()