In [None]:
# Upload the .tar files for Mahout and Spark to Google Drive,
# then mount that same Drive account in Google Colab.
# The Drive contents become available under /content/drive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!apt-get update -qq
!apt-get install -y -qq openjdk-8-jdk-headless wget tar
!apt install -y pigz > /dev/null

In [None]:
import os
import subprocess
from google.colab import drive

# drive_tar is the path of the Spark archive on Google Drive (adjust accordingly).
drive_tar = '/content/drive/MyDrive/spark-2.2.0-bin-hadoop2.7.tgz'  # change this
local_tar = '/content/spark-2.2.0-bin-hadoop2.7.tgz'
extract_dir = '/content/spark'

# pigz -d replaces the archive with its decompressed name:
# "x.tgz" becomes "x.tar" and "x.tar.gz" becomes "x.tar".
if local_tar.endswith(".tgz"):
    tar_file = local_tar[:-len(".tgz")] + ".tar"
elif local_tar.endswith(".gz"):
    tar_file = local_tar[:-len(".gz")]
else:
    raise ValueError(f"Expected a .tgz or .gz archive, got: {local_tar}")

# --- Copy from Drive ---
os.makedirs(extract_dir, exist_ok=True)
print("Copying from Drive to Colab...")
# check=True raises CalledProcessError on failure, so a broken step is not
# followed by a misleading success message.
subprocess.run(["cp", drive_tar, local_tar], check=True)

# --- Step 1: Multi-threaded decompression ---
print("Decompressing with 2 threads using pigz...")
# -f overwrites a .tar left behind by an earlier run of this cell.
subprocess.run(["pigz", "-d", "-f", "-p", "2", local_tar], check=True)

# --- Step 2: Extract .tar ---
print(f"Extracting {tar_file} to {extract_dir}...")
subprocess.run(["tar", "-xf", tar_file, "-C", extract_dir], check=True)

print(f"\n✅ Extraction complete! Files are in: {extract_dir}")


In [None]:
// GraphX shortest-paths example. This is Scala: run it inside spark-shell
// (where `sc` is predefined), not in a Python cell.
// Tip: type :paste in spark-shell, paste the whole block, then press Ctrl-D.

// Basic imports for Shortest Paths
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.lib.ShortestPaths

// Create the vertex RDD: (vertex id, String label)
val vertices: RDD[(VertexId, String)] = sc.parallelize(Seq(
  (1L, "A"),
  (2L, "B"),
  (3L, "C"),
  (4L, "D"),
  (5L, "E"),
  (6L, "F"),
  (7L, "G"),
  (8L, "H"),
  (9L, "I"),
  (10L, "J")
))

// Create the edge RDD with an Int attribute per edge (every weight is 1 here)
val edges: RDD[Edge[Int]] = sc.parallelize(Seq(
  Edge(1L, 2L, 1), Edge(1L, 3L, 1),
  Edge(2L, 4L, 1), Edge(2L, 5L, 1),
  Edge(3L, 6L, 1), Edge(3L, 7L, 1),
  Edge(4L, 8L, 1), Edge(5L, 8L, 1),
  Edge(6L, 9L, 1), Edge(7L, 9L, 1),
  Edge(8L, 10L, 1), Edge(9L, 10L, 1),
  Edge(5L, 6L, 1), Edge(7L, 4L, 1),
  Edge(9L, 1L, 1)
))

// Create the graph
val graph = Graph(vertices, edges)

// Vertices to use as shortest-path targets (landmarks)
val landmarks = Seq(1L, 5L)

// Build a graph whose vertex attribute is a map landmark -> shortest distance.
// Note: GraphX ShortestPaths counts hops; it does not read the edge attribute.
val results = ShortestPaths.run(graph, landmarks)

// Display the result
results.vertices.collect().foreach(println)

// Save the graph as text files so it can be viewed with networkx.
// saveAsTextFile fails if the output directory already exists, so delete
// /content/vertices_csv and /content/edges_csv before re-running.
graph.vertices
  .map { case (id, attr) => s"$id,$attr" }
  .coalesce(1)
  .saveAsTextFile("/content/vertices_csv")

graph.edges
  .map(e => s"${e.srcId},${e.dstId},${e.attr}")
  .coalesce(1)
  .saveAsTextFile("/content/edges_csv")


In [None]:
import os
#Run command to open scala shell
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['SPARK_HOME'] = '/content/spark/spark-2.2.0-bin-hadoop2.7'
os.environ['PATH'] += f":{os.environ['SPARK_HOME']}/bin"

!spark-shell --master local[*] --driver-memory 2g


# If it does not work, run the last cell again.
# Run the previous cell's code as a practice example.

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Read the part files written by Spark; they have no header row, so the
# column names are supplied here.
vertices = pd.read_csv("/content/vertices_csv/part-00000", names=["id", "name"])
edges = pd.read_csv("/content/edges_csv/part-00000", names=["src", "dst", "relation"])

# Build a directed graph (swap in nx.Graph() for an undirected one)
G = nx.DiGraph()

# Nodes carry their label in the 'name' attribute
G.add_nodes_from(
    (v['id'], {'name': v['name']}) for _, v in vertices.iterrows()
)

# Edges carry their CSV attribute in 'relation'
G.add_edges_from(
    (e['src'], e['dst'], {'relation': e['relation']}) for _, e in edges.iterrows()
)

# Lay out the graph, then draw nodes labelled by name and edges labelled by relation
pos = nx.spring_layout(G)
node_names = nx.get_node_attributes(G, 'name')
nx.draw(G, pos, with_labels=True, labels=node_names)
edge_labels = nx.get_edge_attributes(G, 'relation')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
plt.show()


In [None]:
#Delete a folders content as neeeded and all its contents
!rm -rf /content/vertices.csv
