<a href="https://colab.research.google.com/github/AmmarJamshed/Analytics-and-ML-with-Unstructured-Data-/blob/main/ml_text_image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Introduction to Image and Text Data

In [None]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Open the file and reduce it to a single-channel ('L' mode) grayscale image,
# then copy the pixels into a NumPy array.
image = Image.open('path_to_image.jpg').convert(mode='L')
image_array = np.array(image)

# Draw the pixel grid on an explicit axes with a gray colormap,
# then print the underlying array values.
fig, ax = plt.subplots()
ax.imshow(image_array, cmap='gray')
plt.show()
print(image_array)

# Image Preprocessing

In [None]:
import cv2
import matplotlib.pyplot as plt

# Load an image. cv2.imread does not raise when the file is missing or
# unreadable -- it returns None -- so fail early with a clear message.
image_path = 'path_to_image.jpg'
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path!r}")

# Resize to 128x128 and scale pixel values by 1/255.
# No grayscale conversion happens here; the image keeps its three channels.
resized_image = cv2.resize(image, (128, 128))
normalized_image = resized_image / 255.0  # float array, still in BGR channel order

# Show original and resized image side by side.
# OpenCV stores channels as BGR while matplotlib expects RGB, so BOTH panels
# must be converted before display; otherwise red and blue are swapped.
plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
plt.subplot(1, 2, 2)
plt.title("Resized & Normalized Image")
# Reverse the channel axis with slicing (BGR -> RGB); this works on the
# float64 array, which cv2.cvtColor does not accept.
plt.imshow(normalized_image[..., ::-1])
plt.show()

import cv2
import matplotlib.pyplot as plt

# Load an image in grayscale. cv2.imread returns None (it does not raise)
# when the file cannot be read, so check before using the result.
image_path = 'path_to_image.jpg'
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path!r}")

# Plot histogram of pixel intensities: 256 bins over [0, 256),
# i.e. one bin per possible 8-bit intensity value.
plt.hist(image.ravel(), bins=256, range=[0, 256])
plt.title("Histogram of Pixel Intensities")
plt.xlabel("Pixel intensity")
plt.ylabel("Number of pixels")
plt.show()


# ML Model on Image data

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import cifar10

# Fetch the CIFAR-10 train/test splits and divide the pixel values of each
# split by 255.
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
X_train = X_train / 255.0
X_test = X_test / 255.0

# Build a small CNN layer by layer: two conv + max-pool stages followed by
# a dense head that ends in a 10-way softmax.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Configure the optimizer/loss/metric, then train for 5 epochs while
# reporting metrics on the test split after each epoch.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)


## Analyze Image data

In [None]:
import cv2
import matplotlib.pyplot as plt

# Load an image in grayscale. cv2.imread signals failure by returning None
# rather than raising, so validate the result before calling methods on it.
image_path = 'path_to_image.jpg'
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path!r}")

# Plot histogram of pixel intensities: 256 bins over [0, 256),
# i.e. one bin per possible 8-bit intensity value.
plt.hist(image.ravel(), bins=256, range=[0, 256])
plt.title("Histogram of Pixel Intensities")
plt.xlabel("Pixel intensity")
plt.ylabel("Number of pixels")
plt.show()


# Text Representation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short example sentences to encode
texts = [
    "Data science is amazing.",
    "Machine learning is powerful.",
    "Python is great for data analysis.",
]

# Learn the vocabulary from the sentences, then encode each sentence as a
# row of per-word counts (one column per vocabulary entry).
vectorizer = CountVectorizer()
vectorizer.fit(texts)
X = vectorizer.transform(texts)

vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)
print("Encoded Text:\n", X.toarray())

Vocabulary: ['amazing' 'analysis' 'data' 'for' 'great' 'is' 'learning' 'machine'
 'powerful' 'python' 'science']
Encoded Text:
 [[1 0 1 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 1 1 1 1 0 0]
 [0 1 1 1 1 1 0 0 0 1 0]]


## Analyze text data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer models, the stop-word list
# and WordNet. Recent NLTK releases load the tokenizer from 'punkt_tab'
# instead of 'punkt', so both are requested; a resource unknown to the
# installed NLTK version is reported by nltk.download without raising.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

text = "Data science is amazing. Python is great for data analysis!"

# Tokenization (on the lower-cased text)
tokens = word_tokenize(text.lower())

# Removing stop words and lemmatizing.
# The stop-word list is read once into a set: the list does not change
# between tokens, and set membership avoids re-loading and linearly scanning
# it for every token.
lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))
cleaned_text = [
    lemmatizer.lemmatize(word)
    for word in tokens
    if word not in english_stopwords
]

# Note: only stop words are filtered; any other token the tokenizer emits
# (including punctuation tokens) is kept in the output.
print("Original Text:", text)
print("Cleaned Text:", " ".join(cleaned_text))


# Visualize text data

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

import re

# Sample text data
texts = ["Data science is amazing.", "Machine learning is powerful.", "Python is great for data analysis."]

# Generate word cloud from all sentences joined into one string
text = " ".join(texts)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Frequency distribution.
# Lower-case the text and extract runs of word characters so that "Data" and
# "data" share one count and trailing punctuation (e.g. "amazing.") does not
# become part of a token, which a plain whitespace split would allow.
counter = Counter(re.findall(r"\w+", text.lower()))
print("Word Frequencies:", counter)

## ML with text data

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

import numpy as np

# Shared sizes: the tokenizer/padding and the Embedding layer must agree on
# these, so they are defined once instead of being repeated as literals.
VOCAB_SIZE = 100  # tokenizer num_words and Embedding input_dim
MAX_LEN = 10      # length every sequence is padded/truncated to

# Sample text data and labels
texts = ["I love data science", "Machine learning is great", "Python is amazing"]
# Dummy labels for binary classification (all positive here; replace with
# real 0/1 labels for a meaningful model). Keras `fit` expects array/tensor
# targets -- a plain Python list of ints next to a NumPy `x` is rejected by
# its input handling -- so the labels are stored as a NumPy array.
labels = np.array([1, 1, 1])

# Tokenize and pad sequences to a fixed length
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=MAX_LEN)

# Define LSTM model: embedding -> LSTM -> single sigmoid output
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=16, input_length=MAX_LEN),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

# Compile and train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, labels, epochs=5)


# Integrating with CSV data

In [None]:
import pandas as pd

# Dummy stand-in for rows that would normally be loaded from a CSV file
base_columns = {
    'id': [1, 2, 3],
    'value': [100, 200, 300],
}

# Placeholder per-row features; in a real pipeline these would come from
# the image and text models.
image_features = [0.8, 0.6, 0.9]  # Dummy image features
text_features = [0.7, 0.4, 0.5]   # Dummy text features

# Build the table and attach both feature columns in a single expression
csv_data = pd.DataFrame(base_columns).assign(
    image_features=image_features,
    text_features=text_features,
)

print(csv_data)
