# DATAFLEX ENGINE MODULE


# TABULAR DATA


In [12]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m92.2/126.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [15]:
import pandas as pd
import numpy as np
import re
import cv2
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tabulate import tabulate

# Load dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/Multi Model Selection/posts.csv"
df = pd.read_csv(file_path)
#df = df.head(50)  # Limit for testing

# Convert timestamp to datetime and calculate post age
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')
df['post_age'] = (datetime.now() - df['timestamp']).dt.days

# Extract text-based features
df['post_length'] = df['content'].fillna('').apply(lambda x: len(x.split()))
df['title_length'] = df['title'].fillna('').apply(lambda x: len(x.split()))

# Vote ratio to avoid division by zero
df['vote_ratio'] = df['upvotes'] / (df['downvotes'] + 1)

# Engagement score
df['engagement_score'] = df[['votes', 'upvotes', 'downvotes']].sum(axis=1)

# Author activity level
author_post_counts = df['author'].value_counts()
df['author_activity'] = df['author'].map(author_post_counts)

# External link detection (if title contains a URL)
df['external_link'] = df['title_link'].fillna('').apply(lambda x: 1 if re.match(r'^http', str(x)) else 0)

# Image presence detection using a Pretrained CNN Model
#cnn_model = load_model('image_classifier.h5')
def check_image_presence(image_path):
    try:
        img = cv2.imread(image_path)
        return 1 if img is not None else 0
    except:
        return 0

#df['image_usage'] = df['image'].fillna('').apply(classify_image)
df['image_usage'] = df['image'].fillna('').apply(check_image_presence)

# Sentiment Analysis using VADER
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    score = analyzer.polarity_scores(str(text))['compound']
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_label'] = df['content'].fillna('').apply(get_sentiment)

# Define trending posts (top 10% engagement)
threshold = df['engagement_score'].quantile(0.9)
df['trending_post'] = (df['engagement_score'] >= threshold).astype(int)

# Post Type Classification
def classify_post_type(row):
    if pd.isna(row['image']) and pd.notna(row['content']):
        return 'text'
    elif pd.isna(row['content']) and pd.notna(row['image']):
        return 'image'
    else:
        return 'both'

df['post_type'] = df.apply(classify_post_type, axis=1)

# Encode categorical variables
le = LabelEncoder()
df['post_type_encoded'] = le.fit_transform(df['post_type'])
df['sentiment_encoded'] = le.fit_transform(df['sentiment_label'])

# Scale numerical features
scaler = MinMaxScaler()
numeric_features = ['post_length', 'title_length', 'vote_ratio', 'engagement_score', 'author_activity', 'post_age']
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Select final features for classification
final_features = numeric_features + ['external_link', 'image_usage', 'sentiment_encoded', 'post_type_encoded']

# Train XGBoost Model for predicting trending posts
X = df[final_features]
y = df['trending_post']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier()
model.fit(X_train, y_train)

# Evaluate the model


# Select required columns for output
output_columns = ['author_activity', 'external_link', 'image_usage', 'post_type', 'sentiment_label',
                  'post_length', 'title_length', 'vote_ratio', 'engagement_score', 'post_age',
                  'trending_post']

# Save to CSV
output_file = "processed_posts.csv"
df[output_columns].to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")


Processed data saved to processed_posts.csv


# GRAPH DATA


In [16]:
import pandas as pd
import networkx as nx
import community
import community.community_louvain as community_louvain  # ✅ Correct import
 # For Louvain community detection

# Load edge dataset
edge_file = "/content/drive/MyDrive/Colab Notebooks/Multi Model Selection/musae_DE_edges.csv"
edges_df = pd.read_csv(edge_file)
#edges_df=edges_df.head(50)
# Convert to string for consistency
edges_df["frpm"] = edges_df["from"].astype(str)
edges_df["to"] = edges_df["to"].astype(str)

# Create a directed graph (use nx.Graph() for undirected)
G = nx.DiGraph()

# Add edges to the graph
G.add_edges_from(zip(edges_df["from"], edges_df["to"]))

# Print basic graph info
print(f"Graph Loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Compute network features
features = {}

# 1. Degree (total connections per node)
features["degree"] = dict(G.degree())

# 2. In-degree (incoming connections)
features["in_degree"] = dict(G.in_degree())

# 3. Out-degree (outgoing connections)
features["out_degree"] = dict(G.out_degree())

# 4. Closeness Centrality
#features["closeness_centrality"] = nx.closeness_centrality(G)

# 5. Betweenness Centrality
#features["betweenness_centrality"] = nx.betweenness_centrality(G)

# 6. Eigenvector Centrality
features["eigenvector_centrality"] = nx.eigenvector_centrality(G, max_iter=1000)

# 7. Clustering Coefficient
#features["clustering_coefficient"] = nx.clustering(G)

# 8. PageRank
features["pagerank"] = nx.pagerank(G)




# 11. Graph Density
features["density"] = nx.density(G)

# 12. Number of Connected Components
features["num_connected_components"] = nx.number_strongly_connected_components(G)

# 13. Assortativity (degree correlation)
#features["assortativity"] = nx.degree_assortativity_coefficient(G)

# 14. Community Detection (Louvain)
#partition = community.best_partition(G.to_undirected())
partition = community_louvain.best_partition(G.to_undirected())  # ✅ Corrected version
  # Convert to undirected for Louvain
features["community"] = partition

# 15. K-Core
features["k_core"] = nx.core_number(G)

# Convert to DataFrame
df_features = pd.DataFrame(features)

# Save to CSV
output_file = "network_features.csv"
df_features.to_csv(output_file, index_label="node_id")

print(f"✅ Network features saved to {output_file}")

Graph Loaded: 16995 nodes, 153138 edges
✅ Network features saved to network_features.csv


# IMAGE DATA

In [17]:
import cv2
import numpy as np
import pandas as pd
import requests
import os
from skimage.feature import graycomatrix, graycoprops
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing import image
from io import BytesIO
from PIL import Image

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Multi Model Selection/open-images-dataset-validation.csv")
image_urls = df.iloc[:, 0].tolist()  # Process first 5 images

# Create directory to store images
os.makedirs("images", exist_ok=True)

# Initialize Pretrained CNN Model
vgg_model = VGG16(weights='imagenet', include_top=False, pooling='avg')

def download_image(url, filename):
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img.save(filename)
            return filename
    except:
        return None

# Feature Extraction Functions
def extract_metadata(img):
    h, w = img.shape[:2]
    aspect_ratio = w / h
    return h, w, aspect_ratio

def extract_intensity_features(gray_img):
    mean_intensity = np.mean(gray_img)
    std_intensity = np.std(gray_img)
    return mean_intensity, std_intensity

def extract_color_features(img):
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    mean_hue = np.mean(hsv[:, :, 0])
    mean_saturation = np.mean(hsv[:, :, 1])
    mean_brightness = np.mean(hsv[:, :, 2])
    return mean_hue, mean_saturation, mean_brightness

def extract_texture_features(gray_img):
    glcm = graycomatrix(gray_img, distances=[1], angles=[0], levels=256, symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    return contrast, energy

def extract_shape_features(gray_img):
    edges = cv2.Canny(gray_img, 100, 200)
    edge_density = np.sum(edges) / edges.size
    return edge_density

def extract_line_features(gray_img):
    edges = cv2.Canny(gray_img, 50, 150)
    lines = cv2.HoughLines(edges, 1, np.pi / 180, threshold=100)
    num_lines = len(lines) if lines is not None else 0
    return num_lines

def extract_deep_features(img):
    img = cv2.resize(img, (224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = vgg_model.predict(img_array).flatten()
    return features[:3]  # Select first 3 principal deep features

# Process images
features_list = []
for idx, url in enumerate(image_urls):
    filename = f"images/image_{idx}.jpg"
    if download_image(url, filename):
        img = cv2.imread(filename)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        height, width, aspect_ratio = extract_metadata(img)
        mean_intensity, std_intensity = extract_intensity_features(gray)
        mean_hue, mean_saturation, mean_brightness = extract_color_features(img)
        contrast, energy = extract_texture_features(gray)
        edge_density = extract_shape_features(gray)
        num_lines = extract_line_features(gray)
        deep_feature1, deep_feature2, deep_feature3 = extract_deep_features(img)

        features_list.append([
            filename, height, width, aspect_ratio, mean_intensity, std_intensity,
            mean_hue, mean_saturation, mean_brightness, contrast, energy,
            edge_density, num_lines, deep_feature1, deep_feature2, deep_feature3
        ])

# Save extracted features
columns = ["Filename", "Height", "Width", "Aspect Ratio", "Mean Intensity", "Std Intensity",
           "Mean Hue", "Mean Saturation", "Mean Brightness", "GLCM Contrast", "GLCM Energy",
           "Edge Density", "Num Lines", "Deep Feature 1(Edge Pattern Representation)", "Deep Feature 2(Texture Representation)", "Deep Feature 3(Semantic Object Representation)"]

df_features = pd.DataFrame(features_list, columns=columns)
df_features.to_csv("image_features_reduced.csv", index=False)
print("Feature extraction completed and saved to image_features_reduced.csv")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 625ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 389ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 540ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 394ms/step
Feature extraction completed and saved to image_features_reduced.csv


# PDF/DOC DATA

In [17]:
import os
import pandas as pd
import re
from google.colab import drive
from collections import Counter

def mount_google_drive():
    """Mounts Google Drive in Google Colab."""
    drive.mount('/content/drive')

def extract_text_from_txt(txt_path):
    """
    Extracts text from a text file.
    """
    with open(txt_path, "r", encoding="utf-8") as file:
        text = file.read()
    return text

def generate_txt_features(txt_text):
    """
    Generates 10 parameters (features) from the extracted text.
    """
    words = txt_text.split()
    sentences = txt_text.split(". ")
    lines = txt_text.split("\n")
    paragraphs = txt_text.split("\n\n")

    names = len(re.findall(r"\b[A-Z][a-z]+\b", txt_text))
    numbers = len(re.findall(r"\b\d+\b", txt_text))
    verbs = ["is", "am", "are", "was", "were", "be", "been", "have", "has", "had", "do", "does", "did", "run", "ran", "work", "worked", "develop", "developed", "write", "wrote"]
    verb_count = sum(txt_text.lower().count(verb) for verb in verbs)
    bullet_points = len(re.findall(r"•|\*|\-", txt_text))
    capitals = len(re.findall(r"\b[A-Z]{2,}\b", txt_text))
    sections = len(re.findall(r".+:", txt_text))
    dates = len(re.findall(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b", txt_text))
    unique_words = len(set(words))
    special_chars = len(re.findall(r"[!@#$%^&*()_+{}\[\]:;<>,.?~\\/-]", txt_text))
    num_lines = len(lines)

    features = {
        "word_count": len(words),
        "char_count": len(txt_text),
        "sentence_count": len(sentences),
        "avg_word_length": sum(len(word) for word in words) / len(words) if words else 0,
        "avg_sentence_length": len(words) / len(sentences) if sentences else 0,
        "num_names": names,
        "num_numbers": numbers,
        "num_verbs": verb_count,
        "num_bullet_points": bullet_points,
        "num_capitals": capitals,
        "num_sections": sections,
        "num_dates": dates,
        "num_unique_words": unique_words,
        "num_special_chars": special_chars,
        "num_lines": num_lines,
    }
    return features

def process_txt_files(txt_folder, sample_size=20):
    """
    Processes text files from Google Drive and generates a report.
    """
    txt_files = [f for f in os.listdir(txt_folder) if f.endswith(".txt")]
    txt_files = txt_files[:sample_size]

    results = []
    for txt_file in txt_files:
        txt_path = os.path.join(txt_folder, txt_file)
        txt_text = extract_text_from_txt(txt_path)
        txt_features = generate_txt_features(txt_text)
        results.append({
            "file_name": txt_file,
            **txt_features,
        })

    output_path = os.path.join("/content/drive/My Drive/Colab Notebooks/Multi Model Selection", "txt_features.csv")
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)
    print(f"Text features saved to {output_path} (Processed {len(results)} text files)")

# Mount Google Drive and specify folder path
def main():
    mount_google_drive()
    txt_folder = "/content/drive/My Drive/Colab Notebooks/Multi Model Selection/Doc"  # Update with actual folder path
    process_txt_files(txt_folder)

if __name__ == "__main__":
    main()


In [None]:
!pip install PyPDF2 PyDrive2 pandas

In [None]:
from PyPDF2 import PdfReader
import os
import pandas as pd
import re
from google.colab import drive

def mount_drive():
    """Mounts Google Drive in Colab."""
    drive.mount('/content/drive')

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.
    """
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

def generate_pdf_features(pdf_text):
    """
    Generates 10 parameters (features) from the extracted PDF text.
    """
    words = pdf_text.split()
    sentences = pdf_text.split(". ")  # Simple sentence split

    # Feature extraction
    bullet_points = len(re.findall(r"•|\*|-", pdf_text))
    capitals = len(re.findall(r"\b[A-Z]{2,}\b", pdf_text))
    sections = len(re.findall(r".+:", pdf_text))
    dates = len(re.findall(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b", pdf_text))

    features = {
        "total_pages": pdf_text.count("\f") + 1,
        "word_count": len(words),
        "char_count": len(pdf_text),
        "sentence_count": len(sentences),
        "avg_word_length": sum(len(word) for word in words) / len(words) if words else 0,
        "avg_sentence_length": len(words) / len(sentences) if sentences else 0,
        "num_bullet_points": bullet_points,
        "num_capitals": capitals,
        "num_sections": sections,
        "num_dates": dates,
    }
    return features

def process_pdfs(drive_folder_path, sample_size=30):
    """
    Processes a limited number of PDFs from Google Drive and generates a report.
    """
    pdf_files = [f for f in os.listdir(drive_folder_path) if f.endswith(".pdf")]
    pdf_files = pdf_files[:sample_size]
    results = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(drive_folder_path, pdf_file)
        pdf_text = extract_text_from_pdf(pdf_path)
        pdf_features = generate_pdf_features(pdf_text)
        results.append({"file_name": pdf_file, **pdf_features})

    results_df = pd.DataFrame(results)
    results_df.to_csv("/content/pdf_features.csv", index=False)
    print(f"PDF features saved to /content/pdf_features.csv (Processed {len(results)} PDFs)")

# Example usage in Colab
mount_drive()
drive_folder_path = "/content/drive/MyDrive/Colab Notebooks/Multi Model Selection/pdfs"  # Change 'your_folder' to the actual folder name in Drive
process_pdfs(drive_folder_path)
